author    | David S. Miller <davem@davemloft.net> | 2014-09-29 00:13:17 -0400
committer | David S. Miller <davem@davemloft.net> | 2014-09-29 00:13:17 -0400
commit    | a11238ec28d40f56f8b939f6f125694dba3adb70 (patch)
tree      | 3a13df46a74af91d928dc4ac5150c2815ee42207
parent    | 53dfd501819a6e9c3a7d56cac1ddaf03fe90800d (diff)
parent    | e3118e8359bb7c59555aca60c725106e6d78c5ce (diff)
Merge branch 'dctcp'
Daniel Borkmann says:
====================
net: tcp: DCTCP congestion control algorithm
This patch series adds support for the DataCenter TCP (DCTCP) congestion
control algorithm. Please see individual patches for the details.
The last patch adds DCTCP as a congestion control module, and previous
ones add needed infrastructure to extend the congestion control framework.
Joint work between Florian Westphal, Daniel Borkmann and Glenn Judd.
v2 -> v3:
- No changes anywhere, just a resend as requested by Dave
- Added Stephen's ACK
v1 -> v2:
- Rebased to latest net-next
- Addressed Eric's feedback, thanks!
  - Update stale comment wrt. DCTCP ECN usage
  - Don't call INET_ECN_xmit for every packet
  - Add dctcp ss/inetdiag support to expose internal stats to userspace
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/dctcp.txt |  43
-rw-r--r-- | include/net/tcp.h                  |  78
-rw-r--r-- | include/uapi/linux/inet_diag.h     |  13
-rw-r--r-- | net/ipv4/Kconfig                   |  26
-rw-r--r-- | net/ipv4/Makefile                  |   1
-rw-r--r-- | net/ipv4/tcp.c                     |   6
-rw-r--r-- | net/ipv4/tcp_cong.c                |  46
-rw-r--r-- | net/ipv4/tcp_dctcp.c               | 344
-rw-r--r-- | net/ipv4/tcp_input.c               |  32
-rw-r--r-- | net/ipv4/tcp_minisocks.c           |   5
-rw-r--r-- | net/ipv4/tcp_output.c              |  30
-rw-r--r-- | net/ipv4/tcp_westwood.c            |  28
12 files changed, 574 insertions(+), 78 deletions(-)
diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
new file mode 100644
index 000000000000..0d5dfbc89ec9
--- /dev/null
+++ b/Documentation/networking/dctcp.txt
@@ -0,0 +1,43 @@ | |||
1 | DCTCP (DataCenter TCP) | ||
2 | ---------------------- | ||
3 | |||
4 | DCTCP is an enhancement to the TCP congestion control algorithm for data | ||
5 | center networks and leverages Explicit Congestion Notification (ECN) in | ||
6 | the data center network to provide multi-bit feedback to the end hosts. | ||
7 | |||
8 | To enable it on end hosts: | ||
9 | |||
10 | sysctl -w net.ipv4.tcp_congestion_control=dctcp | ||
11 | |||
12 | All switches in the data center network running DCTCP must support ECN | ||
13 | marking and be configured for marking when reaching defined switch buffer | ||
14 | thresholds. The default ECN marking threshold heuristic for DCTCP on | ||
15 | switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, | ||
16 | but might need further careful tweaking. | ||
17 | |||
18 | For more details, see below documents: | ||
19 | |||
20 | Paper: | ||
21 | |||
22 | The algorithm is further described in detail in the following two | ||
23 | SIGCOMM/SIGMETRICS papers: | ||
24 | |||
25 | i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, | ||
26 | Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: | ||
27 | "Data Center TCP (DCTCP)", Data Center Networks session | ||
28 | Proc. ACM SIGCOMM, New Delhi, 2010. | ||
29 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
30 | http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 | ||
31 | |||
32 | ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: | ||
33 | "Analysis of DCTCP: Stability, Convergence, and Fairness" | ||
34 | Proc. ACM SIGMETRICS, San Jose, 2011. | ||
35 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf | ||
36 | |||
37 | IETF informational draft: | ||
38 | |||
39 | http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 | ||
40 | |||
41 | DCTCP site: | ||
42 | |||
43 | http://simula.stanford.edu/~alizade/Site/DCTCP.html | ||
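Besides the system-wide sysctl shown above, the congestion control algorithm can also be selected per socket through the standard TCP_CONGESTION socket option. A minimal userspace sketch (the helper name set_dctcp is made up for illustration; error handling omitted), assuming the dctcp module is available on the host:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Request DCTCP for this socket only; other sockets keep the default. */
static int set_dctcp(int fd)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  "dctcp", strlen("dctcp"));
}

Note that dctcp is not flagged TCP_CONG_NON_RESTRICTED, so an unprivileged process can only select it if it has been added to net.ipv4.tcp_allowed_congestion_control (the default algorithm is always allowed); otherwise CAP_NET_ADMIN is needed.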
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366bf..1f57c5363492 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb { | |||
733 | 733 | ||
734 | #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) | 734 | #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) |
735 | 735 | ||
736 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | ||
737 | * | ||
738 | * If we receive a SYN packet with these bits set, it means a network is | ||
739 | * playing bad games with TOS bits. In order to avoid possible false congestion | ||
740 | * notifications, we disable TCP ECN negociation. | ||
741 | */ | ||
742 | static inline void | ||
743 | TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, | ||
744 | struct net *net) | ||
745 | { | ||
746 | const struct tcphdr *th = tcp_hdr(skb); | ||
747 | |||
748 | if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr && | ||
749 | INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield)) | ||
750 | inet_rsk(req)->ecn_ok = 1; | ||
751 | } | ||
752 | |||
753 | /* Due to TSO, an SKB can be composed of multiple actual | 736 | /* Due to TSO, an SKB can be composed of multiple actual |
754 | * packets. To keep these tracked properly, we use this. | 737 | * packets. To keep these tracked properly, we use this. |
755 | */ | 738 | */ |
@@ -780,8 +763,17 @@ enum tcp_ca_event { | |||
780 | CA_EVENT_CWND_RESTART, /* congestion window restart */ | 763 | CA_EVENT_CWND_RESTART, /* congestion window restart */ |
781 | CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ | 764 | CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ |
782 | CA_EVENT_LOSS, /* loss timeout */ | 765 | CA_EVENT_LOSS, /* loss timeout */ |
783 | CA_EVENT_FAST_ACK, /* in sequence ack */ | 766 | CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ |
784 | CA_EVENT_SLOW_ACK, /* other ack */ | 767 | CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ |
768 | CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ | ||
769 | CA_EVENT_NON_DELAYED_ACK, | ||
770 | }; | ||
771 | |||
772 | /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ | ||
773 | enum tcp_ca_ack_event_flags { | ||
774 | CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ | ||
775 | CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ | ||
776 | CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ | ||
785 | }; | 777 | }; |
786 | 778 | ||
787 | /* | 779 | /* |
@@ -791,7 +783,10 @@ enum tcp_ca_event { | |||
791 | #define TCP_CA_MAX 128 | 783 | #define TCP_CA_MAX 128 |
792 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) | 784 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) |
793 | 785 | ||
786 | /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ | ||
794 | #define TCP_CONG_NON_RESTRICTED 0x1 | 787 | #define TCP_CONG_NON_RESTRICTED 0x1 |
788 | /* Requires ECN/ECT set on all packets */ | ||
789 | #define TCP_CONG_NEEDS_ECN 0x2 | ||
795 | 790 | ||
796 | struct tcp_congestion_ops { | 791 | struct tcp_congestion_ops { |
797 | struct list_head list; | 792 | struct list_head list; |
@@ -810,6 +805,8 @@ struct tcp_congestion_ops { | |||
810 | void (*set_state)(struct sock *sk, u8 new_state); | 805 | void (*set_state)(struct sock *sk, u8 new_state); |
811 | /* call when cwnd event occurs (optional) */ | 806 | /* call when cwnd event occurs (optional) */ |
812 | void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); | 807 | void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); |
808 | /* call when ack arrives (optional) */ | ||
809 | void (*in_ack_event)(struct sock *sk, u32 flags); | ||
813 | /* new value of cwnd after loss (optional) */ | 810 | /* new value of cwnd after loss (optional) */ |
814 | u32 (*undo_cwnd)(struct sock *sk); | 811 | u32 (*undo_cwnd)(struct sock *sk); |
815 | /* hook for packet ack accounting (optional) */ | 812 | /* hook for packet ack accounting (optional) */ |
@@ -824,6 +821,7 @@ struct tcp_congestion_ops { | |||
824 | int tcp_register_congestion_control(struct tcp_congestion_ops *type); | 821 | int tcp_register_congestion_control(struct tcp_congestion_ops *type); |
825 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); | 822 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); |
826 | 823 | ||
824 | void tcp_assign_congestion_control(struct sock *sk); | ||
827 | void tcp_init_congestion_control(struct sock *sk); | 825 | void tcp_init_congestion_control(struct sock *sk); |
828 | void tcp_cleanup_congestion_control(struct sock *sk); | 826 | void tcp_cleanup_congestion_control(struct sock *sk); |
829 | int tcp_set_default_congestion_control(const char *name); | 827 | int tcp_set_default_congestion_control(const char *name); |
@@ -835,11 +833,17 @@ int tcp_set_congestion_control(struct sock *sk, const char *name); | |||
835 | int tcp_slow_start(struct tcp_sock *tp, u32 acked); | 833 | int tcp_slow_start(struct tcp_sock *tp, u32 acked); |
836 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); | 834 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); |
837 | 835 | ||
838 | extern struct tcp_congestion_ops tcp_init_congestion_ops; | ||
839 | u32 tcp_reno_ssthresh(struct sock *sk); | 836 | u32 tcp_reno_ssthresh(struct sock *sk); |
840 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); | 837 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); |
841 | extern struct tcp_congestion_ops tcp_reno; | 838 | extern struct tcp_congestion_ops tcp_reno; |
842 | 839 | ||
840 | static inline bool tcp_ca_needs_ecn(const struct sock *sk) | ||
841 | { | ||
842 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
843 | |||
844 | return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; | ||
845 | } | ||
846 | |||
843 | static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) | 847 | static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) |
844 | { | 848 | { |
845 | struct inet_connection_sock *icsk = inet_csk(sk); | 849 | struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -857,6 +861,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) | |||
857 | icsk->icsk_ca_ops->cwnd_event(sk, event); | 861 | icsk->icsk_ca_ops->cwnd_event(sk, event); |
858 | } | 862 | } |
859 | 863 | ||
864 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | ||
865 | * | ||
866 | * If we receive a SYN packet with these bits set, it means a | ||
867 | * network is playing bad games with TOS bits. In order to | ||
868 | * avoid possible false congestion notifications, we disable | ||
869 | * TCP ECN negociation. | ||
870 | * | ||
871 | * Exception: tcp_ca wants ECN. This is required for DCTCP | ||
872 | * congestion control; it requires setting ECT on all packets, | ||
873 | * including SYN. We inverse the test in this case: If our | ||
874 | * local socket wants ECN, but peer only set ece/cwr (but not | ||
875 | * ECT in IP header) its probably a non-DCTCP aware sender. | ||
876 | */ | ||
877 | static inline void | ||
878 | TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, | ||
879 | const struct sock *listen_sk) | ||
880 | { | ||
881 | const struct tcphdr *th = tcp_hdr(skb); | ||
882 | const struct net *net = sock_net(listen_sk); | ||
883 | bool th_ecn = th->ece && th->cwr; | ||
884 | bool ect, need_ecn; | ||
885 | |||
886 | if (!th_ecn) | ||
887 | return; | ||
888 | |||
889 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); | ||
890 | need_ecn = tcp_ca_needs_ecn(listen_sk); | ||
891 | |||
892 | if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) | ||
893 | inet_rsk(req)->ecn_ok = 1; | ||
894 | else if (ect && need_ecn) | ||
895 | inet_rsk(req)->ecn_ok = 1; | ||
896 | } | ||
897 | |||
860 | /* These functions determine how the current flow behaves in respect of SACK | 898 | /* These functions determine how the current flow behaves in respect of SACK |
861 | * handling. SACK is negotiated with the peer, and therefore it can vary | 899 | * handling. SACK is negotiated with the peer, and therefore it can vary |
862 | * between different flows. | 900 | * between different flows. |
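To illustrate how the extended framework above is meant to be consumed, here is a minimal, hypothetical congestion control module sketch (demo_ecn and its callbacks are invented names, not part of this series) that hooks the new in_ack_event() callback, the CA_EVENT_ECN_* events and the TCP_CONG_NEEDS_ECN flag:

#include <linux/module.h>
#include <net/tcp.h>

static void demo_in_ack_event(struct sock *sk, u32 flags)
{
	/* CA_ACK_ECE is only reported on the ACK slow path (CA_ACK_SLOWPATH). */
	if ((flags & CA_ACK_SLOWPATH) && (flags & CA_ACK_ECE))
		pr_debug("demo_ecn: receiver echoed ECE\n");
}

static void demo_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	/* Per-packet ECN events, raised from TCP_ECN_check_ce() for flows
	 * whose congestion control sets TCP_CONG_NEEDS_ECN.
	 */
	if (ev == CA_EVENT_ECN_IS_CE)
		pr_debug("demo_ecn: CE-marked segment received\n");
	else if (ev == CA_EVENT_ECN_NO_CE)
		pr_debug("demo_ecn: ECT segment without CE\n");
}

static struct tcp_congestion_ops demo_ecn __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.in_ack_event	= demo_in_ack_event,
	.cwnd_event	= demo_cwnd_event,
	/* Ask the stack to keep ECT set on all packets, including SYN. */
	.flags		= TCP_CONG_NEEDS_ECN,
	.owner		= THIS_MODULE,
	.name		= "demo_ecn",
};

static int __init demo_ecn_register(void)
{
	return tcp_register_congestion_control(&demo_ecn);
}

static void __exit demo_ecn_unregister(void)
{
	tcp_unregister_congestion_control(&demo_ecn);
}

module_init(demo_ecn_register);
module_exit(demo_ecn_unregister);
MODULE_LICENSE("GPL");

The real in-tree user of these hooks is tcp_dctcp.c, added further down in this series.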
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index bbde90fa5838..d65c0a09efd3 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -110,10 +110,10 @@ enum { | |||
110 | INET_DIAG_TCLASS, | 110 | INET_DIAG_TCLASS, |
111 | INET_DIAG_SKMEMINFO, | 111 | INET_DIAG_SKMEMINFO, |
112 | INET_DIAG_SHUTDOWN, | 112 | INET_DIAG_SHUTDOWN, |
113 | INET_DIAG_DCTCPINFO, | ||
113 | }; | 114 | }; |
114 | 115 | ||
115 | #define INET_DIAG_MAX INET_DIAG_SHUTDOWN | 116 | #define INET_DIAG_MAX INET_DIAG_DCTCPINFO |
116 | |||
117 | 117 | ||
118 | /* INET_DIAG_MEM */ | 118 | /* INET_DIAG_MEM */ |
119 | 119 | ||
@@ -133,5 +133,14 @@ struct tcpvegas_info { | |||
133 | __u32 tcpv_minrtt; | 133 | __u32 tcpv_minrtt; |
134 | }; | 134 | }; |
135 | 135 | ||
136 | /* INET_DIAG_DCTCPINFO */ | ||
137 | |||
138 | struct tcp_dctcp_info { | ||
139 | __u16 dctcp_enabled; | ||
140 | __u16 dctcp_ce_state; | ||
141 | __u32 dctcp_alpha; | ||
142 | __u32 dctcp_ab_ecn; | ||
143 | __u32 dctcp_ab_tot; | ||
144 | }; | ||
136 | 145 | ||
137 | #endif /* _UAPI_INET_DIAG_H_ */ | 146 | #endif /* _UAPI_INET_DIAG_H_ */ |
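The new INET_DIAG_DCTCPINFO attribute is what the ss/inetdiag support mentioned in the cover letter consumes. As a hypothetical illustration only (the helper below is not part of the patch, and the netlink plumbing needed to obtain the attribute is omitted), a userspace consumer could interpret the payload like this:

#include <stdio.h>
#include <linux/inet_diag.h>

/* attr_data/len: payload of an INET_DIAG_DCTCPINFO netlink attribute. */
static void print_dctcp_info(const void *attr_data, unsigned int len)
{
	const struct tcp_dctcp_info *info = attr_data;

	/* dctcp_enabled == 0 means the socket fell back to plain Reno. */
	if (len < sizeof(*info) || !info->dctcp_enabled)
		return;

	/* dctcp_alpha is fixed point; 1024 corresponds to alpha = 1.0. */
	printf("ce_state=%u alpha=%u acked_bytes_ecn=%u acked_bytes_total=%u\n",
	       info->dctcp_ce_state, info->dctcp_alpha,
	       info->dctcp_ab_ecn, info->dctcp_ab_tot);
}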
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 84f710b7472a..69fb37854449 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS | |||
570 | For further details see: | 570 | For further details see: |
571 | http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html | 571 | http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html |
572 | 572 | ||
573 | config TCP_CONG_DCTCP | ||
574 | tristate "DataCenter TCP (DCTCP)" | ||
575 | default n | ||
576 | ---help--- | ||
577 | DCTCP leverages Explicit Congestion Notification (ECN) in the network to | ||
578 | provide multi-bit feedback to the end hosts. It is designed to provide: | ||
579 | |||
580 | - High burst tolerance (incast due to partition/aggregate), | ||
581 | - Low latency (short flows, queries), | ||
582 | - High throughput (continuous data updates, large file transfers) with | ||
583 | commodity, shallow-buffered switches. | ||
584 | |||
585 | All switches in the data center network running DCTCP must support | ||
586 | ECN marking and be configured for marking when reaching defined switch | ||
587 | buffer thresholds. The default ECN marking threshold heuristic for | ||
588 | DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets | ||
589 | (~100KB) at 10Gbps, but might need further careful tweaking. | ||
590 | |||
591 | For further details see: | ||
592 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
593 | |||
573 | choice | 594 | choice |
574 | prompt "Default TCP congestion control" | 595 | prompt "Default TCP congestion control" |
575 | default DEFAULT_CUBIC | 596 | default DEFAULT_CUBIC |
@@ -598,9 +619,11 @@ choice | |||
598 | config DEFAULT_WESTWOOD | 619 | config DEFAULT_WESTWOOD |
599 | bool "Westwood" if TCP_CONG_WESTWOOD=y | 620 | bool "Westwood" if TCP_CONG_WESTWOOD=y |
600 | 621 | ||
622 | config DEFAULT_DCTCP | ||
623 | bool "DCTCP" if TCP_CONG_DCTCP=y | ||
624 | |||
601 | config DEFAULT_RENO | 625 | config DEFAULT_RENO |
602 | bool "Reno" | 626 | bool "Reno" |
603 | |||
604 | endchoice | 627 | endchoice |
605 | 628 | ||
606 | endif | 629 | endif |
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG | |||
620 | default "westwood" if DEFAULT_WESTWOOD | 643 | default "westwood" if DEFAULT_WESTWOOD |
621 | default "veno" if DEFAULT_VENO | 644 | default "veno" if DEFAULT_VENO |
622 | default "reno" if DEFAULT_RENO | 645 | default "reno" if DEFAULT_RENO |
646 | default "dctcp" if DEFAULT_DCTCP | ||
623 | default "cubic" | 647 | default "cubic" |
624 | 648 | ||
625 | config TCP_MD5SIG | 649 | config TCP_MD5SIG |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d78d404c596f..d8105787c199 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o | |||
43 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o | 43 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o |
44 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 44 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
45 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | 45 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o |
46 | obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o | ||
46 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 47 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
47 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | 48 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o |
48 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | 49 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87289e51be00..cf5e508e1ef5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk) | |||
405 | 405 | ||
406 | tp->reordering = sysctl_tcp_reordering; | 406 | tp->reordering = sysctl_tcp_reordering; |
407 | tcp_enable_early_retrans(tp); | 407 | tcp_enable_early_retrans(tp); |
408 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; | 408 | tcp_assign_congestion_control(sk); |
409 | 409 | ||
410 | tp->tsoffset = 0; | 410 | tp->tsoffset = 0; |
411 | 411 | ||
@@ -3258,8 +3258,6 @@ void __init tcp_init(void) | |||
3258 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | 3258 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); |
3259 | 3259 | ||
3260 | tcp_metrics_init(); | 3260 | tcp_metrics_init(); |
3261 | 3261 | BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); | |
3262 | tcp_register_congestion_control(&tcp_reno); | ||
3263 | |||
3264 | tcp_tasklet_init(); | 3262 | tcp_tasklet_init(); |
3265 | } | 3263 | } |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 80248f56c89f..a6c8a5775624 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |||
74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | 74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); |
75 | 75 | ||
76 | /* Assign choice of congestion control. */ | 76 | /* Assign choice of congestion control. */ |
77 | void tcp_init_congestion_control(struct sock *sk) | 77 | void tcp_assign_congestion_control(struct sock *sk) |
78 | { | 78 | { |
79 | struct inet_connection_sock *icsk = inet_csk(sk); | 79 | struct inet_connection_sock *icsk = inet_csk(sk); |
80 | struct tcp_congestion_ops *ca; | 80 | struct tcp_congestion_ops *ca; |
81 | 81 | ||
82 | /* if no choice made yet assign the current value set as default */ | 82 | rcu_read_lock(); |
83 | if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { | 83 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { |
84 | rcu_read_lock(); | 84 | if (likely(try_module_get(ca->owner))) { |
85 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | 85 | icsk->icsk_ca_ops = ca; |
86 | if (try_module_get(ca->owner)) { | 86 | goto out; |
87 | icsk->icsk_ca_ops = ca; | ||
88 | break; | ||
89 | } | ||
90 | |||
91 | /* fallback to next available */ | ||
92 | } | 87 | } |
93 | rcu_read_unlock(); | 88 | /* Fallback to next available. The last really |
89 | * guaranteed fallback is Reno from this list. | ||
90 | */ | ||
94 | } | 91 | } |
92 | out: | ||
93 | rcu_read_unlock(); | ||
94 | |||
95 | /* Clear out private data before diag gets it and | ||
96 | * the ca has not been initialized. | ||
97 | */ | ||
98 | if (ca->get_info) | ||
99 | memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); | ||
100 | } | ||
101 | |||
102 | void tcp_init_congestion_control(struct sock *sk) | ||
103 | { | ||
104 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
95 | 105 | ||
96 | if (icsk->icsk_ca_ops->init) | 106 | if (icsk->icsk_ca_ops->init) |
97 | icsk->icsk_ca_ops->init(sk); | 107 | icsk->icsk_ca_ops->init(sk); |
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = { | |||
345 | .ssthresh = tcp_reno_ssthresh, | 355 | .ssthresh = tcp_reno_ssthresh, |
346 | .cong_avoid = tcp_reno_cong_avoid, | 356 | .cong_avoid = tcp_reno_cong_avoid, |
347 | }; | 357 | }; |
348 | |||
349 | /* Initial congestion control used (until SYN) | ||
350 | * really reno under another name so we can tell difference | ||
351 | * during tcp_set_default_congestion_control | ||
352 | */ | ||
353 | struct tcp_congestion_ops tcp_init_congestion_ops = { | ||
354 | .name = "", | ||
355 | .owner = THIS_MODULE, | ||
356 | .ssthresh = tcp_reno_ssthresh, | ||
357 | .cong_avoid = tcp_reno_cong_avoid, | ||
358 | }; | ||
359 | EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); | ||
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@ | |||
1 | /* DataCenter TCP (DCTCP) congestion control. | ||
2 | * | ||
3 | * http://simula.stanford.edu/~alizade/Site/DCTCP.html | ||
4 | * | ||
5 | * This is an implementation of DCTCP over Reno, an enhancement to the | ||
6 | * TCP congestion control algorithm designed for data centers. DCTCP | ||
7 | * leverages Explicit Congestion Notification (ECN) in the network to | ||
8 | * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet | ||
9 | * the following three data center transport requirements: | ||
10 | * | ||
11 | * - High burst tolerance (incast due to partition/aggregate) | ||
12 | * - Low latency (short flows, queries) | ||
13 | * - High throughput (continuous data updates, large file transfers) | ||
14 | * with commodity shallow buffered switches | ||
15 | * | ||
16 | * The algorithm is described in detail in the following two papers: | ||
17 | * | ||
18 | * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, | ||
19 | * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: | ||
20 | * "Data Center TCP (DCTCP)", Data Center Networks session | ||
21 | * Proc. ACM SIGCOMM, New Delhi, 2010. | ||
22 | * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
23 | * | ||
24 | * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: | ||
25 | * "Analysis of DCTCP: Stability, Convergence, and Fairness" | ||
26 | * Proc. ACM SIGMETRICS, San Jose, 2011. | ||
27 | * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf | ||
28 | * | ||
29 | * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. | ||
30 | * | ||
31 | * Authors: | ||
32 | * | ||
33 | * Daniel Borkmann <dborkman@redhat.com> | ||
34 | * Florian Westphal <fw@strlen.de> | ||
35 | * Glenn Judd <glenn.judd@morganstanley.com> | ||
36 | * | ||
37 | * This program is free software; you can redistribute it and/or modify | ||
38 | * it under the terms of the GNU General Public License as published by | ||
39 | * the Free Software Foundation; either version 2 of the License, or (at | ||
40 | * your option) any later version. | ||
41 | */ | ||
42 | |||
43 | #include <linux/module.h> | ||
44 | #include <linux/mm.h> | ||
45 | #include <net/tcp.h> | ||
46 | #include <linux/inet_diag.h> | ||
47 | |||
48 | #define DCTCP_MAX_ALPHA 1024U | ||
49 | |||
50 | struct dctcp { | ||
51 | u32 acked_bytes_ecn; | ||
52 | u32 acked_bytes_total; | ||
53 | u32 prior_snd_una; | ||
54 | u32 prior_rcv_nxt; | ||
55 | u32 dctcp_alpha; | ||
56 | u32 next_seq; | ||
57 | u32 ce_state; | ||
58 | u32 delayed_ack_reserved; | ||
59 | }; | ||
60 | |||
61 | static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ | ||
62 | module_param(dctcp_shift_g, uint, 0644); | ||
63 | MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); | ||
64 | |||
65 | static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; | ||
66 | module_param(dctcp_alpha_on_init, uint, 0644); | ||
67 | MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); | ||
68 | |||
69 | static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; | ||
70 | module_param(dctcp_clamp_alpha_on_loss, uint, 0644); | ||
71 | MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, | ||
72 | "parameter for clamping alpha on loss"); | ||
73 | |||
74 | static struct tcp_congestion_ops dctcp_reno; | ||
75 | |||
76 | static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) | ||
77 | { | ||
78 | ca->next_seq = tp->snd_nxt; | ||
79 | |||
80 | ca->acked_bytes_ecn = 0; | ||
81 | ca->acked_bytes_total = 0; | ||
82 | } | ||
83 | |||
84 | static void dctcp_init(struct sock *sk) | ||
85 | { | ||
86 | const struct tcp_sock *tp = tcp_sk(sk); | ||
87 | |||
88 | if ((tp->ecn_flags & TCP_ECN_OK) || | ||
89 | (sk->sk_state == TCP_LISTEN || | ||
90 | sk->sk_state == TCP_CLOSE)) { | ||
91 | struct dctcp *ca = inet_csk_ca(sk); | ||
92 | |||
93 | ca->prior_snd_una = tp->snd_una; | ||
94 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
95 | |||
96 | ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); | ||
97 | |||
98 | ca->delayed_ack_reserved = 0; | ||
99 | ca->ce_state = 0; | ||
100 | |||
101 | dctcp_reset(tp, ca); | ||
102 | return; | ||
103 | } | ||
104 | |||
105 | /* No ECN support? Fall back to Reno. Also need to clear | ||
106 | * ECT from sk since it is set during 3WHS for DCTCP. | ||
107 | */ | ||
108 | inet_csk(sk)->icsk_ca_ops = &dctcp_reno; | ||
109 | INET_ECN_dontxmit(sk); | ||
110 | } | ||
111 | |||
112 | static u32 dctcp_ssthresh(struct sock *sk) | ||
113 | { | ||
114 | const struct dctcp *ca = inet_csk_ca(sk); | ||
115 | struct tcp_sock *tp = tcp_sk(sk); | ||
116 | |||
117 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); | ||
118 | } | ||
119 | |||
120 | /* Minimal DCTP CE state machine: | ||
121 | * | ||
122 | * S: 0 <- last pkt was non-CE | ||
123 | * 1 <- last pkt was CE | ||
124 | */ | ||
125 | |||
126 | static void dctcp_ce_state_0_to_1(struct sock *sk) | ||
127 | { | ||
128 | struct dctcp *ca = inet_csk_ca(sk); | ||
129 | struct tcp_sock *tp = tcp_sk(sk); | ||
130 | |||
131 | /* State has changed from CE=0 to CE=1 and delayed | ||
132 | * ACK has not sent yet. | ||
133 | */ | ||
134 | if (!ca->ce_state && ca->delayed_ack_reserved) { | ||
135 | u32 tmp_rcv_nxt; | ||
136 | |||
137 | /* Save current rcv_nxt. */ | ||
138 | tmp_rcv_nxt = tp->rcv_nxt; | ||
139 | |||
140 | /* Generate previous ack with CE=0. */ | ||
141 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | ||
142 | tp->rcv_nxt = ca->prior_rcv_nxt; | ||
143 | |||
144 | tcp_send_ack(sk); | ||
145 | |||
146 | /* Recover current rcv_nxt. */ | ||
147 | tp->rcv_nxt = tmp_rcv_nxt; | ||
148 | } | ||
149 | |||
150 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
151 | ca->ce_state = 1; | ||
152 | |||
153 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
154 | } | ||
155 | |||
156 | static void dctcp_ce_state_1_to_0(struct sock *sk) | ||
157 | { | ||
158 | struct dctcp *ca = inet_csk_ca(sk); | ||
159 | struct tcp_sock *tp = tcp_sk(sk); | ||
160 | |||
161 | /* State has changed from CE=1 to CE=0 and delayed | ||
162 | * ACK has not sent yet. | ||
163 | */ | ||
164 | if (ca->ce_state && ca->delayed_ack_reserved) { | ||
165 | u32 tmp_rcv_nxt; | ||
166 | |||
167 | /* Save current rcv_nxt. */ | ||
168 | tmp_rcv_nxt = tp->rcv_nxt; | ||
169 | |||
170 | /* Generate previous ack with CE=1. */ | ||
171 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
172 | tp->rcv_nxt = ca->prior_rcv_nxt; | ||
173 | |||
174 | tcp_send_ack(sk); | ||
175 | |||
176 | /* Recover current rcv_nxt. */ | ||
177 | tp->rcv_nxt = tmp_rcv_nxt; | ||
178 | } | ||
179 | |||
180 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
181 | ca->ce_state = 0; | ||
182 | |||
183 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | ||
184 | } | ||
185 | |||
186 | static void dctcp_update_alpha(struct sock *sk, u32 flags) | ||
187 | { | ||
188 | const struct tcp_sock *tp = tcp_sk(sk); | ||
189 | struct dctcp *ca = inet_csk_ca(sk); | ||
190 | u32 acked_bytes = tp->snd_una - ca->prior_snd_una; | ||
191 | |||
192 | /* If ack did not advance snd_una, count dupack as MSS size. | ||
193 | * If ack did update window, do not count it at all. | ||
194 | */ | ||
195 | if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) | ||
196 | acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; | ||
197 | if (acked_bytes) { | ||
198 | ca->acked_bytes_total += acked_bytes; | ||
199 | ca->prior_snd_una = tp->snd_una; | ||
200 | |||
201 | if (flags & CA_ACK_ECE) | ||
202 | ca->acked_bytes_ecn += acked_bytes; | ||
203 | } | ||
204 | |||
205 | /* Expired RTT */ | ||
206 | if (!before(tp->snd_una, ca->next_seq)) { | ||
207 | /* For avoiding denominator == 1. */ | ||
208 | if (ca->acked_bytes_total == 0) | ||
209 | ca->acked_bytes_total = 1; | ||
210 | |||
211 | /* alpha = (1 - g) * alpha + g * F */ | ||
212 | ca->dctcp_alpha = ca->dctcp_alpha - | ||
213 | (ca->dctcp_alpha >> dctcp_shift_g) + | ||
214 | (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / | ||
215 | ca->acked_bytes_total; | ||
216 | |||
217 | if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) | ||
218 | /* Clamp dctcp_alpha to max. */ | ||
219 | ca->dctcp_alpha = DCTCP_MAX_ALPHA; | ||
220 | |||
221 | dctcp_reset(tp, ca); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | static void dctcp_state(struct sock *sk, u8 new_state) | ||
226 | { | ||
227 | if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { | ||
228 | struct dctcp *ca = inet_csk_ca(sk); | ||
229 | |||
230 | /* If this extension is enabled, we clamp dctcp_alpha to | ||
231 | * max on packet loss; the motivation is that dctcp_alpha | ||
232 | * is an indicator to the extend of congestion and packet | ||
233 | * loss is an indicator of extreme congestion; setting | ||
234 | * this in practice turned out to be beneficial, and | ||
235 | * effectively assumes total congestion which reduces the | ||
236 | * window by half. | ||
237 | */ | ||
238 | ca->dctcp_alpha = DCTCP_MAX_ALPHA; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) | ||
243 | { | ||
244 | struct dctcp *ca = inet_csk_ca(sk); | ||
245 | |||
246 | switch (ev) { | ||
247 | case CA_EVENT_DELAYED_ACK: | ||
248 | if (!ca->delayed_ack_reserved) | ||
249 | ca->delayed_ack_reserved = 1; | ||
250 | break; | ||
251 | case CA_EVENT_NON_DELAYED_ACK: | ||
252 | if (ca->delayed_ack_reserved) | ||
253 | ca->delayed_ack_reserved = 0; | ||
254 | break; | ||
255 | default: | ||
256 | /* Don't care for the rest. */ | ||
257 | break; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) | ||
262 | { | ||
263 | switch (ev) { | ||
264 | case CA_EVENT_ECN_IS_CE: | ||
265 | dctcp_ce_state_0_to_1(sk); | ||
266 | break; | ||
267 | case CA_EVENT_ECN_NO_CE: | ||
268 | dctcp_ce_state_1_to_0(sk); | ||
269 | break; | ||
270 | case CA_EVENT_DELAYED_ACK: | ||
271 | case CA_EVENT_NON_DELAYED_ACK: | ||
272 | dctcp_update_ack_reserved(sk, ev); | ||
273 | break; | ||
274 | default: | ||
275 | /* Don't care for the rest. */ | ||
276 | break; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | ||
281 | { | ||
282 | const struct dctcp *ca = inet_csk_ca(sk); | ||
283 | |||
284 | /* Fill it also in case of VEGASINFO due to req struct limits. | ||
285 | * We can still correctly retrieve it later. | ||
286 | */ | ||
287 | if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || | ||
288 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | ||
289 | struct tcp_dctcp_info info; | ||
290 | |||
291 | memset(&info, 0, sizeof(info)); | ||
292 | if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { | ||
293 | info.dctcp_enabled = 1; | ||
294 | info.dctcp_ce_state = (u16) ca->ce_state; | ||
295 | info.dctcp_alpha = ca->dctcp_alpha; | ||
296 | info.dctcp_ab_ecn = ca->acked_bytes_ecn; | ||
297 | info.dctcp_ab_tot = ca->acked_bytes_total; | ||
298 | } | ||
299 | |||
300 | nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | static struct tcp_congestion_ops dctcp __read_mostly = { | ||
305 | .init = dctcp_init, | ||
306 | .in_ack_event = dctcp_update_alpha, | ||
307 | .cwnd_event = dctcp_cwnd_event, | ||
308 | .ssthresh = dctcp_ssthresh, | ||
309 | .cong_avoid = tcp_reno_cong_avoid, | ||
310 | .set_state = dctcp_state, | ||
311 | .get_info = dctcp_get_info, | ||
312 | .flags = TCP_CONG_NEEDS_ECN, | ||
313 | .owner = THIS_MODULE, | ||
314 | .name = "dctcp", | ||
315 | }; | ||
316 | |||
317 | static struct tcp_congestion_ops dctcp_reno __read_mostly = { | ||
318 | .ssthresh = tcp_reno_ssthresh, | ||
319 | .cong_avoid = tcp_reno_cong_avoid, | ||
320 | .get_info = dctcp_get_info, | ||
321 | .owner = THIS_MODULE, | ||
322 | .name = "dctcp-reno", | ||
323 | }; | ||
324 | |||
325 | static int __init dctcp_register(void) | ||
326 | { | ||
327 | BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); | ||
328 | return tcp_register_congestion_control(&dctcp); | ||
329 | } | ||
330 | |||
331 | static void __exit dctcp_unregister(void) | ||
332 | { | ||
333 | tcp_unregister_congestion_control(&dctcp); | ||
334 | } | ||
335 | |||
336 | module_init(dctcp_register); | ||
337 | module_exit(dctcp_unregister); | ||
338 | |||
339 | MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); | ||
340 | MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); | ||
341 | MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); | ||
342 | |||
343 | MODULE_LICENSE("GPL v2"); | ||
344 | MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); | ||
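For reference (not part of the patch text), the fixed-point arithmetic in dctcp_update_alpha() and dctcp_ssthresh() above corresponds to the estimator and window reduction from the DCTCP paper, with alpha kept in units of 1/1024 (DCTCP_MAX_ALPHA) and g = 2^-dctcp_shift_g, i.e. 1/16 by default:

\[
F = \frac{\text{acked\_bytes\_ecn}}{\text{acked\_bytes\_total}}, \qquad
\alpha \leftarrow (1 - g)\,\alpha + g\,F
\]

\[
\text{ssthresh} = \text{cwnd}\left(1 - \frac{\alpha}{2}\right)
= \text{cwnd} - \left\lfloor \frac{\text{cwnd} \cdot \alpha_{1024}}{2048} \right\rfloor
\]

where alpha_1024 = 1024 * alpha is the scaled value stored in ca->dctcp_alpha. The >> 11U in dctcp_ssthresh() combines the division by 2 from the paper with the 1/1024 descaling, and the << (10U - dctcp_shift_g) in dctcp_update_alpha() applies the g * 1024 factor to the per-RTT CE fraction.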
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fae..fc133178c787 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s | |||
233 | tcp_enter_quickack_mode((struct sock *)tp); | 233 | tcp_enter_quickack_mode((struct sock *)tp); |
234 | break; | 234 | break; |
235 | case INET_ECN_CE: | 235 | case INET_ECN_CE: |
236 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
237 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); | ||
238 | |||
236 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { | 239 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { |
237 | /* Better not delay acks, sender can have a very low cwnd */ | 240 | /* Better not delay acks, sender can have a very low cwnd */ |
238 | tcp_enter_quickack_mode((struct sock *)tp); | 241 | tcp_enter_quickack_mode((struct sock *)tp); |
239 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | 242 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; |
240 | } | 243 | } |
241 | /* fallinto */ | 244 | tp->ecn_flags |= TCP_ECN_SEEN; |
245 | break; | ||
242 | default: | 246 | default: |
247 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
248 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); | ||
243 | tp->ecn_flags |= TCP_ECN_SEEN; | 249 | tp->ecn_flags |= TCP_ECN_SEEN; |
250 | break; | ||
244 | } | 251 | } |
245 | } | 252 | } |
246 | 253 | ||
@@ -3362,6 +3369,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) | |||
3362 | } | 3369 | } |
3363 | } | 3370 | } |
3364 | 3371 | ||
3372 | static inline void tcp_in_ack_event(struct sock *sk, u32 flags) | ||
3373 | { | ||
3374 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
3375 | |||
3376 | if (icsk->icsk_ca_ops->in_ack_event) | ||
3377 | icsk->icsk_ca_ops->in_ack_event(sk, flags); | ||
3378 | } | ||
3379 | |||
3365 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3380 | /* This routine deals with incoming acks, but not outgoing ones. */ |
3366 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3381 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) |
3367 | { | 3382 | { |
@@ -3421,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3421 | tp->snd_una = ack; | 3436 | tp->snd_una = ack; |
3422 | flag |= FLAG_WIN_UPDATE; | 3437 | flag |= FLAG_WIN_UPDATE; |
3423 | 3438 | ||
3424 | tcp_ca_event(sk, CA_EVENT_FAST_ACK); | 3439 | tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); |
3425 | 3440 | ||
3426 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); | 3441 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); |
3427 | } else { | 3442 | } else { |
3443 | u32 ack_ev_flags = CA_ACK_SLOWPATH; | ||
3444 | |||
3428 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 3445 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
3429 | flag |= FLAG_DATA; | 3446 | flag |= FLAG_DATA; |
3430 | else | 3447 | else |
@@ -3436,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3436 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3453 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3437 | &sack_rtt_us); | 3454 | &sack_rtt_us); |
3438 | 3455 | ||
3439 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3456 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { |
3440 | flag |= FLAG_ECE; | 3457 | flag |= FLAG_ECE; |
3458 | ack_ev_flags |= CA_ACK_ECE; | ||
3459 | } | ||
3460 | |||
3461 | if (flag & FLAG_WIN_UPDATE) | ||
3462 | ack_ev_flags |= CA_ACK_WIN_UPDATE; | ||
3441 | 3463 | ||
3442 | tcp_ca_event(sk, CA_EVENT_SLOW_ACK); | 3464 | tcp_in_ack_event(sk, ack_ev_flags); |
3443 | } | 3465 | } |
3444 | 3466 | ||
3445 | /* We passed data and got it acked, remove any soft error | 3467 | /* We passed data and got it acked, remove any soft error |
@@ -5944,7 +5966,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
5944 | goto drop_and_free; | 5966 | goto drop_and_free; |
5945 | 5967 | ||
5946 | if (!want_cookie || tmp_opt.tstamp_ok) | 5968 | if (!want_cookie || tmp_opt.tstamp_ok) |
5947 | TCP_ECN_create_request(req, skb, sock_net(sk)); | 5969 | TCP_ECN_create_request(req, skb, sk); |
5948 | 5970 | ||
5949 | if (want_cookie) { | 5971 | if (want_cookie) { |
5950 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); | 5972 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a058f411d3a6..47b73506b77e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
451 | newtp->snd_cwnd = TCP_INIT_CWND; | 451 | newtp->snd_cwnd = TCP_INIT_CWND; |
452 | newtp->snd_cwnd_cnt = 0; | 452 | newtp->snd_cwnd_cnt = 0; |
453 | 453 | ||
454 | if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && | 454 | if (!try_module_get(newicsk->icsk_ca_ops->owner)) |
455 | !try_module_get(newicsk->icsk_ca_ops->owner)) | 455 | tcp_assign_congestion_control(newsk); |
456 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | ||
457 | 456 | ||
458 | tcp_set_ca_state(newsk, TCP_CA_Open); | 457 | tcp_set_ca_state(newsk, TCP_CA_Open); |
459 | tcp_init_xmit_timers(newsk); | 458 | tcp_init_xmit_timers(newsk); |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c6..86a0216fcaa1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk) | |||
318 | } | 318 | } |
319 | 319 | ||
320 | /* Packet ECN state for a SYN-ACK */ | 320 | /* Packet ECN state for a SYN-ACK */ |
321 | static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) | 321 | static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) |
322 | { | 322 | { |
323 | const struct tcp_sock *tp = tcp_sk(sk); | ||
324 | |||
323 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; | 325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; |
324 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 326 | if (!(tp->ecn_flags & TCP_ECN_OK)) |
325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; | 327 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; |
328 | else if (tcp_ca_needs_ecn(sk)) | ||
329 | INET_ECN_xmit(sk); | ||
326 | } | 330 | } |
327 | 331 | ||
328 | /* Packet ECN state for a SYN. */ | 332 | /* Packet ECN state for a SYN. */ |
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) | |||
331 | struct tcp_sock *tp = tcp_sk(sk); | 335 | struct tcp_sock *tp = tcp_sk(sk); |
332 | 336 | ||
333 | tp->ecn_flags = 0; | 337 | tp->ecn_flags = 0; |
334 | if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { | 338 | if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || |
339 | tcp_ca_needs_ecn(sk)) { | ||
335 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; | 340 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; |
336 | tp->ecn_flags = TCP_ECN_OK; | 341 | tp->ecn_flags = TCP_ECN_OK; |
342 | if (tcp_ca_needs_ecn(sk)) | ||
343 | INET_ECN_xmit(sk); | ||
337 | } | 344 | } |
338 | } | 345 | } |
339 | 346 | ||
340 | static __inline__ void | 347 | static __inline__ void |
341 | TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) | 348 | TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, |
349 | struct sock *sk) | ||
342 | { | 350 | { |
343 | if (inet_rsk(req)->ecn_ok) | 351 | if (inet_rsk(req)->ecn_ok) { |
344 | th->ece = 1; | 352 | th->ece = 1; |
353 | if (tcp_ca_needs_ecn(sk)) | ||
354 | INET_ECN_xmit(sk); | ||
355 | } | ||
345 | } | 356 | } |
346 | 357 | ||
347 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to | 358 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to |
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, | |||
362 | tcp_hdr(skb)->cwr = 1; | 373 | tcp_hdr(skb)->cwr = 1; |
363 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; | 374 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; |
364 | } | 375 | } |
365 | } else { | 376 | } else if (!tcp_ca_needs_ecn(sk)) { |
366 | /* ACK or retransmitted segment: clear ECT|CE */ | 377 | /* ACK or retransmitted segment: clear ECT|CE */ |
367 | INET_ECN_dontxmit(sk); | 378 | INET_ECN_dontxmit(sk); |
368 | } | 379 | } |
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) | |||
2789 | } | 2800 | } |
2790 | 2801 | ||
2791 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; | 2802 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; |
2792 | TCP_ECN_send_synack(tcp_sk(sk), skb); | 2803 | TCP_ECN_send_synack(sk, skb); |
2793 | } | 2804 | } |
2794 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2805 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2795 | } | 2806 | } |
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2848 | memset(th, 0, sizeof(struct tcphdr)); | 2859 | memset(th, 0, sizeof(struct tcphdr)); |
2849 | th->syn = 1; | 2860 | th->syn = 1; |
2850 | th->ack = 1; | 2861 | th->ack = 1; |
2851 | TCP_ECN_make_synack(req, th); | 2862 | TCP_ECN_make_synack(req, th, sk); |
2852 | th->source = htons(ireq->ir_num); | 2863 | th->source = htons(ireq->ir_num); |
2853 | th->dest = ireq->ir_rmt_port; | 2864 | th->dest = ireq->ir_rmt_port; |
2854 | /* Setting of flags are superfluous here for callers (and ECE is | 2865 | /* Setting of flags are superfluous here for callers (and ECE is |
@@ -3119,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk) | |||
3119 | int ato = icsk->icsk_ack.ato; | 3130 | int ato = icsk->icsk_ack.ato; |
3120 | unsigned long timeout; | 3131 | unsigned long timeout; |
3121 | 3132 | ||
3133 | tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); | ||
3134 | |||
3122 | if (ato > TCP_DELACK_MIN) { | 3135 | if (ato > TCP_DELACK_MIN) { |
3123 | const struct tcp_sock *tp = tcp_sk(sk); | 3136 | const struct tcp_sock *tp = tcp_sk(sk); |
3124 | int max_ato = HZ / 2; | 3137 | int max_ato = HZ / 2; |
@@ -3175,6 +3188,8 @@ void tcp_send_ack(struct sock *sk) | |||
3175 | if (sk->sk_state == TCP_CLOSE) | 3188 | if (sk->sk_state == TCP_CLOSE) |
3176 | return; | 3189 | return; |
3177 | 3190 | ||
3191 | tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); | ||
3192 | |||
3178 | /* We are not putting this on the write queue, so | 3193 | /* We are not putting this on the write queue, so |
3179 | * tcp_transmit_skb() will set the ownership to this | 3194 | * tcp_transmit_skb() will set the ownership to this |
3180 | * sock. | 3195 | * sock. |
@@ -3196,6 +3211,7 @@ void tcp_send_ack(struct sock *sk) | |||
3196 | skb_mstamp_get(&buff->skb_mstamp); | 3211 | skb_mstamp_get(&buff->skb_mstamp); |
3197 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); | 3212 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); |
3198 | } | 3213 | } |
3214 | EXPORT_SYMBOL_GPL(tcp_send_ack); | ||
3199 | 3215 | ||
3200 | /* This routine sends a packet with an out of date sequence | 3216 | /* This routine sends a packet with an out of date sequence |
3201 | * number. It assumes the other end will try to ack it. | 3217 | * number. It assumes the other end will try to ack it. |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 81911a92356c..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) | |||
220 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | 220 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); |
221 | } | 221 | } |
222 | 222 | ||
223 | static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) | ||
224 | { | ||
225 | if (ack_flags & CA_ACK_SLOWPATH) { | ||
226 | struct westwood *w = inet_csk_ca(sk); | ||
227 | |||
228 | westwood_update_window(sk); | ||
229 | w->bk += westwood_acked_count(sk); | ||
230 | |||
231 | update_rtt_min(w); | ||
232 | return; | ||
233 | } | ||
234 | |||
235 | westwood_fast_bw(sk); | ||
236 | } | ||
237 | |||
223 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | 238 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
224 | { | 239 | { |
225 | struct tcp_sock *tp = tcp_sk(sk); | 240 | struct tcp_sock *tp = tcp_sk(sk); |
226 | struct westwood *w = inet_csk_ca(sk); | 241 | struct westwood *w = inet_csk_ca(sk); |
227 | 242 | ||
228 | switch (event) { | 243 | switch (event) { |
229 | case CA_EVENT_FAST_ACK: | ||
230 | westwood_fast_bw(sk); | ||
231 | break; | ||
232 | |||
233 | case CA_EVENT_COMPLETE_CWR: | 244 | case CA_EVENT_COMPLETE_CWR: |
234 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 245 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
235 | break; | 246 | break; |
236 | |||
237 | case CA_EVENT_LOSS: | 247 | case CA_EVENT_LOSS: |
238 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 248 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
239 | /* Update RTT_min when next ack arrives */ | 249 | /* Update RTT_min when next ack arrives */ |
240 | w->reset_rtt_min = 1; | 250 | w->reset_rtt_min = 1; |
241 | break; | 251 | break; |
242 | |||
243 | case CA_EVENT_SLOW_ACK: | ||
244 | westwood_update_window(sk); | ||
245 | w->bk += westwood_acked_count(sk); | ||
246 | update_rtt_min(w); | ||
247 | break; | ||
248 | |||
249 | default: | 252 | default: |
250 | /* don't care */ | 253 | /* don't care */ |
251 | break; | 254 | break; |
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { | |||
274 | .ssthresh = tcp_reno_ssthresh, | 277 | .ssthresh = tcp_reno_ssthresh, |
275 | .cong_avoid = tcp_reno_cong_avoid, | 278 | .cong_avoid = tcp_reno_cong_avoid, |
276 | .cwnd_event = tcp_westwood_event, | 279 | .cwnd_event = tcp_westwood_event, |
280 | .in_ack_event = tcp_westwood_ack, | ||
277 | .get_info = tcp_westwood_info, | 281 | .get_info = tcp_westwood_info, |
278 | .pkts_acked = tcp_westwood_pkts_acked, | 282 | .pkts_acked = tcp_westwood_pkts_acked, |
279 | 283 | ||