author    | David S. Miller <davem@davemloft.net> | 2014-09-29 00:13:17 -0400
committer | David S. Miller <davem@davemloft.net> | 2014-09-29 00:13:17 -0400
commit    | a11238ec28d40f56f8b939f6f125694dba3adb70 (patch)
tree      | 3a13df46a74af91d928dc4ac5150c2815ee42207
parent    | 53dfd501819a6e9c3a7d56cac1ddaf03fe90800d (diff)
parent    | e3118e8359bb7c59555aca60c725106e6d78c5ce (diff)
Merge branch 'dctcp'
Daniel Borkmann says:
====================
net: tcp: DCTCP congestion control algorithm
This patch series adds support for the DataCenter TCP (DCTCP) congestion
control algorithm. Please see individual patches for the details.
The last patch adds DCTCP as a congestion control module, and previous
ones add needed infrastructure to extend the congestion control framework.
Joint work between Florian Westphal, Daniel Borkmann and Glenn Judd.
v2 -> v3:
- No changes anywhere, just a resend as requested by Dave
- Added Stephen's ACK
v1 -> v2:
- Rebased to latest net-next
- Addressed Eric's feedback, thanks!
  - Update stale comment wrt. DCTCP ECN usage
  - Don't call INET_ECN_xmit for every packet
  - Add dctcp ss/inetdiag support to expose internal stats to userspace
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/dctcp.txt |  43
-rw-r--r-- | include/net/tcp.h                  |  78
-rw-r--r-- | include/uapi/linux/inet_diag.h     |  13
-rw-r--r-- | net/ipv4/Kconfig                   |  26
-rw-r--r-- | net/ipv4/Makefile                  |   1
-rw-r--r-- | net/ipv4/tcp.c                     |   6
-rw-r--r-- | net/ipv4/tcp_cong.c                |  46
-rw-r--r-- | net/ipv4/tcp_dctcp.c               | 344
-rw-r--r-- | net/ipv4/tcp_input.c               |  32
-rw-r--r-- | net/ipv4/tcp_minisocks.c           |   5
-rw-r--r-- | net/ipv4/tcp_output.c              |  30
-rw-r--r-- | net/ipv4/tcp_westwood.c            |  28
12 files changed, 574 insertions(+), 78 deletions(-)
diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
new file mode 100644
index 000000000000..0d5dfbc89ec9
--- /dev/null
+++ b/Documentation/networking/dctcp.txt
@@ -0,0 +1,43 @@ | |||
1 | DCTCP (DataCenter TCP) | ||
2 | ---------------------- | ||
3 | |||
4 | DCTCP is an enhancement to the TCP congestion control algorithm for data | ||
5 | center networks and leverages Explicit Congestion Notification (ECN) in | ||
6 | the data center network to provide multi-bit feedback to the end hosts. | ||
7 | |||
8 | To enable it on end hosts: | ||
9 | |||
10 | sysctl -w net.ipv4.tcp_congestion_control=dctcp | ||
11 | |||
12 | All switches in the data center network running DCTCP must support ECN | ||
13 | marking and be configured for marking when reaching defined switch buffer | ||
14 | thresholds. The default ECN marking threshold heuristic for DCTCP on | ||
15 | switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, | ||
16 | but might need further careful tweaking. | ||
17 | |||
18 | For more details, see below documents: | ||
19 | |||
20 | Paper: | ||
21 | |||
22 | The algorithm is further described in detail in the following two | ||
23 | SIGCOMM/SIGMETRICS papers: | ||
24 | |||
25 | i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, | ||
26 | Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: | ||
27 | "Data Center TCP (DCTCP)", Data Center Networks session | ||
28 | Proc. ACM SIGCOMM, New Delhi, 2010. | ||
29 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
30 | http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 | ||
31 | |||
32 | ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: | ||
33 | "Analysis of DCTCP: Stability, Convergence, and Fairness" | ||
34 | Proc. ACM SIGMETRICS, San Jose, 2011. | ||
35 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf | ||
36 | |||
37 | IETF informational draft: | ||
38 | |||
39 | http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 | ||
40 | |||
41 | DCTCP site: | ||
42 | |||
43 | http://simula.stanford.edu/~alizade/Site/DCTCP.html | ||
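Besides the system-wide sysctl shown above, the congestion control algorithm can also be selected per socket through the standard TCP_CONGESTION socket option. A minimal userspace sketch (the helper name set_dctcp is made up for illustration; error handling omitted), assuming the dctcp module is available on the host:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Request DCTCP for this socket only; other sockets keep the default. */
static int set_dctcp(int fd)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  "dctcp", strlen("dctcp"));
}

Note that dctcp is not flagged TCP_CONG_NON_RESTRICTED, so an unprivileged process can only select it if it has been added to net.ipv4.tcp_allowed_congestion_control (the default algorithm is always allowed); otherwise CAP_NET_ADMIN is needed.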
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366bf..1f57c5363492 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb { | |||
733 | 733 | ||
734 | #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) | 734 | #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) |
735 | 735 | ||
736 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | ||
737 | * | ||
738 | * If we receive a SYN packet with these bits set, it means a network is | ||
739 | * playing bad games with TOS bits. In order to avoid possible false congestion | ||
740 | * notifications, we disable TCP ECN negociation. | ||
741 | */ | ||
742 | static inline void | ||
743 | TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, | ||
744 | struct net *net) | ||
745 | { | ||
746 | const struct tcphdr *th = tcp_hdr(skb); | ||
747 | |||
748 | if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr && | ||
749 | INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield)) | ||
750 | inet_rsk(req)->ecn_ok = 1; | ||
751 | } | ||
752 | |||
753 | /* Due to TSO, an SKB can be composed of multiple actual | 736 | /* Due to TSO, an SKB can be composed of multiple actual |
754 | * packets. To keep these tracked properly, we use this. | 737 | * packets. To keep these tracked properly, we use this. |
755 | */ | 738 | */ |
@@ -780,8 +763,17 @@ enum tcp_ca_event { | |||
780 | CA_EVENT_CWND_RESTART, /* congestion window restart */ | 763 | CA_EVENT_CWND_RESTART, /* congestion window restart */ |
781 | CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ | 764 | CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ |
782 | CA_EVENT_LOSS, /* loss timeout */ | 765 | CA_EVENT_LOSS, /* loss timeout */ |
783 | CA_EVENT_FAST_ACK, /* in sequence ack */ | 766 | CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ |
784 | CA_EVENT_SLOW_ACK, /* other ack */ | 767 | CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ |
768 | CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ | ||
769 | CA_EVENT_NON_DELAYED_ACK, | ||
770 | }; | ||
771 | |||
772 | /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ | ||
773 | enum tcp_ca_ack_event_flags { | ||
774 | CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ | ||
775 | CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ | ||
776 | CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ | ||
785 | }; | 777 | }; |
786 | 778 | ||
787 | /* | 779 | /* |
@@ -791,7 +783,10 @@ enum tcp_ca_event { | |||
791 | #define TCP_CA_MAX 128 | 783 | #define TCP_CA_MAX 128 |
792 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) | 784 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) |
793 | 785 | ||
786 | /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ | ||
794 | #define TCP_CONG_NON_RESTRICTED 0x1 | 787 | #define TCP_CONG_NON_RESTRICTED 0x1 |
788 | /* Requires ECN/ECT set on all packets */ | ||
789 | #define TCP_CONG_NEEDS_ECN 0x2 | ||
795 | 790 | ||
796 | struct tcp_congestion_ops { | 791 | struct tcp_congestion_ops { |
797 | struct list_head list; | 792 | struct list_head list; |
@@ -810,6 +805,8 @@ struct tcp_congestion_ops { | |||
810 | void (*set_state)(struct sock *sk, u8 new_state); | 805 | void (*set_state)(struct sock *sk, u8 new_state); |
811 | /* call when cwnd event occurs (optional) */ | 806 | /* call when cwnd event occurs (optional) */ |
812 | void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); | 807 | void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); |
808 | /* call when ack arrives (optional) */ | ||
809 | void (*in_ack_event)(struct sock *sk, u32 flags); | ||
813 | /* new value of cwnd after loss (optional) */ | 810 | /* new value of cwnd after loss (optional) */ |
814 | u32 (*undo_cwnd)(struct sock *sk); | 811 | u32 (*undo_cwnd)(struct sock *sk); |
815 | /* hook for packet ack accounting (optional) */ | 812 | /* hook for packet ack accounting (optional) */ |
@@ -824,6 +821,7 @@ struct tcp_congestion_ops { | |||
824 | int tcp_register_congestion_control(struct tcp_congestion_ops *type); | 821 | int tcp_register_congestion_control(struct tcp_congestion_ops *type); |
825 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); | 822 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); |
826 | 823 | ||
824 | void tcp_assign_congestion_control(struct sock *sk); | ||
827 | void tcp_init_congestion_control(struct sock *sk); | 825 | void tcp_init_congestion_control(struct sock *sk); |
828 | void tcp_cleanup_congestion_control(struct sock *sk); | 826 | void tcp_cleanup_congestion_control(struct sock *sk); |
829 | int tcp_set_default_congestion_control(const char *name); | 827 | int tcp_set_default_congestion_control(const char *name); |
@@ -835,11 +833,17 @@ int tcp_set_congestion_control(struct sock *sk, const char *name); | |||
835 | int tcp_slow_start(struct tcp_sock *tp, u32 acked); | 833 | int tcp_slow_start(struct tcp_sock *tp, u32 acked); |
836 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); | 834 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); |
837 | 835 | ||
838 | extern struct tcp_congestion_ops tcp_init_congestion_ops; | ||
839 | u32 tcp_reno_ssthresh(struct sock *sk); | 836 | u32 tcp_reno_ssthresh(struct sock *sk); |
840 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); | 837 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); |
841 | extern struct tcp_congestion_ops tcp_reno; | 838 | extern struct tcp_congestion_ops tcp_reno; |
842 | 839 | ||
840 | static inline bool tcp_ca_needs_ecn(const struct sock *sk) | ||
841 | { | ||
842 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
843 | |||
844 | return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; | ||
845 | } | ||
846 | |||
843 | static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) | 847 | static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) |
844 | { | 848 | { |
845 | struct inet_connection_sock *icsk = inet_csk(sk); | 849 | struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -857,6 +861,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) | |||
857 | icsk->icsk_ca_ops->cwnd_event(sk, event); | 861 | icsk->icsk_ca_ops->cwnd_event(sk, event); |
858 | } | 862 | } |
859 | 863 | ||
864 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | ||
865 | * | ||
866 | * If we receive a SYN packet with these bits set, it means a | ||
867 | * network is playing bad games with TOS bits. In order to | ||
868 | * avoid possible false congestion notifications, we disable | ||
869 | * TCP ECN negociation. | ||
870 | * | ||
871 | * Exception: tcp_ca wants ECN. This is required for DCTCP | ||
872 | * congestion control; it requires setting ECT on all packets, | ||
873 | * including SYN. We inverse the test in this case: If our | ||
874 | * local socket wants ECN, but peer only set ece/cwr (but not | ||
875 | * ECT in IP header) its probably a non-DCTCP aware sender. | ||
876 | */ | ||
877 | static inline void | ||
878 | TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, | ||
879 | const struct sock *listen_sk) | ||
880 | { | ||
881 | const struct tcphdr *th = tcp_hdr(skb); | ||
882 | const struct net *net = sock_net(listen_sk); | ||
883 | bool th_ecn = th->ece && th->cwr; | ||
884 | bool ect, need_ecn; | ||
885 | |||
886 | if (!th_ecn) | ||
887 | return; | ||
888 | |||
889 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); | ||
890 | need_ecn = tcp_ca_needs_ecn(listen_sk); | ||
891 | |||
892 | if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) | ||
893 | inet_rsk(req)->ecn_ok = 1; | ||
894 | else if (ect && need_ecn) | ||
895 | inet_rsk(req)->ecn_ok = 1; | ||
896 | } | ||
897 | |||
860 | /* These functions determine how the current flow behaves in respect of SACK | 898 | /* These functions determine how the current flow behaves in respect of SACK |
861 | * handling. SACK is negotiated with the peer, and therefore it can vary | 899 | * handling. SACK is negotiated with the peer, and therefore it can vary |
862 | * between different flows. | 900 | * between different flows. |
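To illustrate how the extended framework above is meant to be consumed, here is a minimal, hypothetical congestion control module sketch (demo_ecn and its callbacks are invented names, not part of this series) that hooks the new in_ack_event() callback, the CA_EVENT_ECN_* events and the TCP_CONG_NEEDS_ECN flag:

#include <linux/module.h>
#include <net/tcp.h>

static void demo_in_ack_event(struct sock *sk, u32 flags)
{
	/* CA_ACK_ECE is only reported on the ACK slow path (CA_ACK_SLOWPATH). */
	if ((flags & CA_ACK_SLOWPATH) && (flags & CA_ACK_ECE))
		pr_debug("demo_ecn: receiver echoed ECE\n");
}

static void demo_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	/* Per-packet ECN events, raised from TCP_ECN_check_ce() for flows
	 * whose congestion control sets TCP_CONG_NEEDS_ECN.
	 */
	if (ev == CA_EVENT_ECN_IS_CE)
		pr_debug("demo_ecn: CE-marked segment received\n");
	else if (ev == CA_EVENT_ECN_NO_CE)
		pr_debug("demo_ecn: ECT segment without CE\n");
}

static struct tcp_congestion_ops demo_ecn __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.in_ack_event	= demo_in_ack_event,
	.cwnd_event	= demo_cwnd_event,
	/* Ask the stack to keep ECT set on all packets, including SYN. */
	.flags		= TCP_CONG_NEEDS_ECN,
	.owner		= THIS_MODULE,
	.name		= "demo_ecn",
};

static int __init demo_ecn_register(void)
{
	return tcp_register_congestion_control(&demo_ecn);
}

static void __exit demo_ecn_unregister(void)
{
	tcp_unregister_congestion_control(&demo_ecn);
}

module_init(demo_ecn_register);
module_exit(demo_ecn_unregister);
MODULE_LICENSE("GPL");

The real in-tree user of these hooks is tcp_dctcp.c, added further down in this series.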
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index bbde90fa5838..d65c0a09efd3 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -110,10 +110,10 @@ enum { | |||
110 | INET_DIAG_TCLASS, | 110 | INET_DIAG_TCLASS, |
111 | INET_DIAG_SKMEMINFO, | 111 | INET_DIAG_SKMEMINFO, |
112 | INET_DIAG_SHUTDOWN, | 112 | INET_DIAG_SHUTDOWN, |
113 | INET_DIAG_DCTCPINFO, | ||
113 | }; | 114 | }; |
114 | 115 | ||
115 | #define INET_DIAG_MAX INET_DIAG_SHUTDOWN | 116 | #define INET_DIAG_MAX INET_DIAG_DCTCPINFO |
116 | |||
117 | 117 | ||
118 | /* INET_DIAG_MEM */ | 118 | /* INET_DIAG_MEM */ |
119 | 119 | ||
@@ -133,5 +133,14 @@ struct tcpvegas_info { | |||
133 | __u32 tcpv_minrtt; | 133 | __u32 tcpv_minrtt; |
134 | }; | 134 | }; |
135 | 135 | ||
136 | /* INET_DIAG_DCTCPINFO */ | ||
137 | |||
138 | struct tcp_dctcp_info { | ||
139 | __u16 dctcp_enabled; | ||
140 | __u16 dctcp_ce_state; | ||
141 | __u32 dctcp_alpha; | ||
142 | __u32 dctcp_ab_ecn; | ||
143 | __u32 dctcp_ab_tot; | ||
144 | }; | ||
136 | 145 | ||
137 | #endif /* _UAPI_INET_DIAG_H_ */ | 146 | #endif /* _UAPI_INET_DIAG_H_ */ |
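The new INET_DIAG_DCTCPINFO attribute is what the ss/inetdiag support mentioned in the cover letter consumes. As a hypothetical illustration only (the helper below is not part of the patch, and the netlink plumbing needed to obtain the attribute is omitted), a userspace consumer could interpret the payload like this:

#include <stdio.h>
#include <linux/inet_diag.h>

/* attr_data/len: payload of an INET_DIAG_DCTCPINFO netlink attribute. */
static void print_dctcp_info(const void *attr_data, unsigned int len)
{
	const struct tcp_dctcp_info *info = attr_data;

	/* dctcp_enabled == 0 means the socket fell back to plain Reno. */
	if (len < sizeof(*info) || !info->dctcp_enabled)
		return;

	/* dctcp_alpha is fixed point; 1024 corresponds to alpha = 1.0. */
	printf("ce_state=%u alpha=%u acked_bytes_ecn=%u acked_bytes_total=%u\n",
	       info->dctcp_ce_state, info->dctcp_alpha,
	       info->dctcp_ab_ecn, info->dctcp_ab_tot);
}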
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 84f710b7472a..69fb37854449 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS | |||
570 | For further details see: | 570 | For further details see: |
571 | http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html | 571 | http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html |
572 | 572 | ||
573 | config TCP_CONG_DCTCP | ||
574 | tristate "DataCenter TCP (DCTCP)" | ||
575 | default n | ||
576 | ---help--- | ||
577 | DCTCP leverages Explicit Congestion Notification (ECN) in the network to | ||
578 | provide multi-bit feedback to the end hosts. It is designed to provide: | ||
579 | |||
580 | - High burst tolerance (incast due to partition/aggregate), | ||
581 | - Low latency (short flows, queries), | ||
582 | - High throughput (continuous data updates, large file transfers) with | ||
583 | commodity, shallow-buffered switches. | ||
584 | |||
585 | All switches in the data center network running DCTCP must support | ||
586 | ECN marking and be configured for marking when reaching defined switch | ||
587 | buffer thresholds. The default ECN marking threshold heuristic for | ||
588 | DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets | ||
589 | (~100KB) at 10Gbps, but might need further careful tweaking. | ||
590 | |||
591 | For further details see: | ||
592 | http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
593 | |||
573 | choice | 594 | choice |
574 | prompt "Default TCP congestion control" | 595 | prompt "Default TCP congestion control" |
575 | default DEFAULT_CUBIC | 596 | default DEFAULT_CUBIC |
@@ -598,9 +619,11 @@ choice | |||
598 | config DEFAULT_WESTWOOD | 619 | config DEFAULT_WESTWOOD |
599 | bool "Westwood" if TCP_CONG_WESTWOOD=y | 620 | bool "Westwood" if TCP_CONG_WESTWOOD=y |
600 | 621 | ||
622 | config DEFAULT_DCTCP | ||
623 | bool "DCTCP" if TCP_CONG_DCTCP=y | ||
624 | |||
601 | config DEFAULT_RENO | 625 | config DEFAULT_RENO |
602 | bool "Reno" | 626 | bool "Reno" |
603 | |||
604 | endchoice | 627 | endchoice |
605 | 628 | ||
606 | endif | 629 | endif |
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG | |||
620 | default "westwood" if DEFAULT_WESTWOOD | 643 | default "westwood" if DEFAULT_WESTWOOD |
621 | default "veno" if DEFAULT_VENO | 644 | default "veno" if DEFAULT_VENO |
622 | default "reno" if DEFAULT_RENO | 645 | default "reno" if DEFAULT_RENO |
646 | default "dctcp" if DEFAULT_DCTCP | ||
623 | default "cubic" | 647 | default "cubic" |
624 | 648 | ||
625 | config TCP_MD5SIG | 649 | config TCP_MD5SIG |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d78d404c596f..d8105787c199 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o | |||
43 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o | 43 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o |
44 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 44 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
45 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | 45 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o |
46 | obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o | ||
46 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 47 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
47 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | 48 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o |
48 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | 49 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87289e51be00..cf5e508e1ef5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk) | |||
405 | 405 | ||
406 | tp->reordering = sysctl_tcp_reordering; | 406 | tp->reordering = sysctl_tcp_reordering; |
407 | tcp_enable_early_retrans(tp); | 407 | tcp_enable_early_retrans(tp); |
408 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; | 408 | tcp_assign_congestion_control(sk); |
409 | 409 | ||
410 | tp->tsoffset = 0; | 410 | tp->tsoffset = 0; |
411 | 411 | ||
@@ -3258,8 +3258,6 @@ void __init tcp_init(void) | |||
3258 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | 3258 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); |
3259 | 3259 | ||
3260 | tcp_metrics_init(); | 3260 | tcp_metrics_init(); |
3261 | 3261 | BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); | |
3262 | tcp_register_congestion_control(&tcp_reno); | ||
3263 | |||
3264 | tcp_tasklet_init(); | 3262 | tcp_tasklet_init(); |
3265 | } | 3263 | } |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 80248f56c89f..a6c8a5775624 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |||
74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | 74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); |
75 | 75 | ||
76 | /* Assign choice of congestion control. */ | 76 | /* Assign choice of congestion control. */ |
77 | void tcp_init_congestion_control(struct sock *sk) | 77 | void tcp_assign_congestion_control(struct sock *sk) |
78 | { | 78 | { |
79 | struct inet_connection_sock *icsk = inet_csk(sk); | 79 | struct inet_connection_sock *icsk = inet_csk(sk); |
80 | struct tcp_congestion_ops *ca; | 80 | struct tcp_congestion_ops *ca; |
81 | 81 | ||
82 | /* if no choice made yet assign the current value set as default */ | 82 | rcu_read_lock(); |
83 | if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { | 83 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { |
84 | rcu_read_lock(); | 84 | if (likely(try_module_get(ca->owner))) { |
85 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | 85 | icsk->icsk_ca_ops = ca; |
86 | if (try_module_get(ca->owner)) { | 86 | goto out; |
87 | icsk->icsk_ca_ops = ca; | ||
88 | break; | ||
89 | } | ||
90 | |||
91 | /* fallback to next available */ | ||
92 | } | 87 | } |
93 | rcu_read_unlock(); | 88 | /* Fallback to next available. The last really |
89 | * guaranteed fallback is Reno from this list. | ||
90 | */ | ||
94 | } | 91 | } |
92 | out: | ||
93 | rcu_read_unlock(); | ||
94 | |||
95 | /* Clear out private data before diag gets it and | ||
96 | * the ca has not been initialized. | ||
97 | */ | ||
98 | if (ca->get_info) | ||
99 | memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); | ||
100 | } | ||
101 | |||
102 | void tcp_init_congestion_control(struct sock *sk) | ||
103 | { | ||
104 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
95 | 105 | ||
96 | if (icsk->icsk_ca_ops->init) | 106 | if (icsk->icsk_ca_ops->init) |
97 | icsk->icsk_ca_ops->init(sk); | 107 | icsk->icsk_ca_ops->init(sk); |
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = { | |||
345 | .ssthresh = tcp_reno_ssthresh, | 355 | .ssthresh = tcp_reno_ssthresh, |
346 | .cong_avoid = tcp_reno_cong_avoid, | 356 | .cong_avoid = tcp_reno_cong_avoid, |
347 | }; | 357 | }; |
348 | |||
349 | /* Initial congestion control used (until SYN) | ||
350 | * really reno under another name so we can tell difference | ||
351 | * during tcp_set_default_congestion_control | ||
352 | */ | ||
353 | struct tcp_congestion_ops tcp_init_congestion_ops = { | ||
354 | .name = "", | ||
355 | .owner = THIS_MODULE, | ||
356 | .ssthresh = tcp_reno_ssthresh, | ||
357 | .cong_avoid = tcp_reno_cong_avoid, | ||
358 | }; | ||
359 | EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); | ||
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@ | |||
1 | /* DataCenter TCP (DCTCP) congestion control. | ||
2 | * | ||
3 | * http://simula.stanford.edu/~alizade/Site/DCTCP.html | ||
4 | * | ||
5 | * This is an implementation of DCTCP over Reno, an enhancement to the | ||
6 | * TCP congestion control algorithm designed for data centers. DCTCP | ||
7 | * leverages Explicit Congestion Notification (ECN) in the network to | ||
8 | * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet | ||
9 | * the following three data center transport requirements: | ||
10 | * | ||
11 | * - High burst tolerance (incast due to partition/aggregate) | ||
12 | * - Low latency (short flows, queries) | ||
13 | * - High throughput (continuous data updates, large file transfers) | ||
14 | * with commodity shallow buffered switches | ||
15 | * | ||
16 | * The algorithm is described in detail in the following two papers: | ||
17 | * | ||
18 | * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, | ||
19 | * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: | ||
20 | * "Data Center TCP (DCTCP)", Data Center Networks session | ||
21 | * Proc. ACM SIGCOMM, New Delhi, 2010. | ||
22 | * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf | ||
23 | * | ||
24 | * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: | ||
25 | * "Analysis of DCTCP: Stability, Convergence, and Fairness" | ||
26 | * Proc. ACM SIGMETRICS, San Jose, 2011. | ||
27 | * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf | ||
28 | * | ||
29 | * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. | ||
30 | * | ||
31 | * Authors: | ||
32 | * | ||
33 | * Daniel Borkmann <dborkman@redhat.com> | ||
34 | * Florian Westphal <fw@strlen.de> | ||
35 | * Glenn Judd <glenn.judd@morganstanley.com> | ||
36 | * | ||
37 | * This program is free software; you can redistribute it and/or modify | ||
38 | * it under the terms of the GNU General Public License as published by | ||
39 | * the Free Software Foundation; either version 2 of the License, or (at | ||
40 | * your option) any later version. | ||
41 | */ | ||
42 | |||
43 | #include <linux/module.h> | ||
44 | #include <linux/mm.h> | ||
45 | #include <net/tcp.h> | ||
46 | #include <linux/inet_diag.h> | ||
47 | |||
48 | #define DCTCP_MAX_ALPHA 1024U | ||
49 | |||
50 | struct dctcp { | ||
51 | u32 acked_bytes_ecn; | ||
52 | u32 acked_bytes_total; | ||
53 | u32 prior_snd_una; | ||
54 | u32 prior_rcv_nxt; | ||
55 | u32 dctcp_alpha; | ||
56 | u32 next_seq; | ||
57 | u32 ce_state; | ||
58 | u32 delayed_ack_reserved; | ||
59 | }; | ||
60 | |||
61 | static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ | ||
62 | module_param(dctcp_shift_g, uint, 0644); | ||
63 | MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); | ||
64 | |||
65 | static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; | ||
66 | module_param(dctcp_alpha_on_init, uint, 0644); | ||
67 | MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); | ||
68 | |||
69 | static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; | ||
70 | module_param(dctcp_clamp_alpha_on_loss, uint, 0644); | ||
71 | MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, | ||
72 | "parameter for clamping alpha on loss"); | ||
73 | |||
74 | static struct tcp_congestion_ops dctcp_reno; | ||
75 | |||
76 | static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) | ||
77 | { | ||
78 | ca->next_seq = tp->snd_nxt; | ||
79 | |||
80 | ca->acked_bytes_ecn = 0; | ||
81 | ca->acked_bytes_total = 0; | ||
82 | } | ||
83 | |||
84 | static void dctcp_init(struct sock *sk) | ||
85 | { | ||
86 | const struct tcp_sock *tp = tcp_sk(sk); | ||
87 | |||
88 | if ((tp->ecn_flags & TCP_ECN_OK) || | ||
89 | (sk->sk_state == TCP_LISTEN || | ||
90 | sk->sk_state == TCP_CLOSE)) { | ||
91 | struct dctcp *ca = inet_csk_ca(sk); | ||
92 | |||
93 | ca->prior_snd_una = tp->snd_una; | ||
94 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
95 | |||
96 | ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); | ||
97 | |||
98 | ca->delayed_ack_reserved = 0; | ||
99 | ca->ce_state = 0; | ||
100 | |||
101 | dctcp_reset(tp, ca); | ||
102 | return; | ||
103 | } | ||
104 | |||
105 | /* No ECN support? Fall back to Reno. Also need to clear | ||
106 | * ECT from sk since it is set during 3WHS for DCTCP. | ||
107 | */ | ||
108 | inet_csk(sk)->icsk_ca_ops = &dctcp_reno; | ||
109 | INET_ECN_dontxmit(sk); | ||
110 | } | ||
111 | |||
112 | static u32 dctcp_ssthresh(struct sock *sk) | ||
113 | { | ||
114 | const struct dctcp *ca = inet_csk_ca(sk); | ||
115 | struct tcp_sock *tp = tcp_sk(sk); | ||
116 | |||
117 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); | ||
118 | } | ||
119 | |||
120 | /* Minimal DCTP CE state machine: | ||
121 | * | ||
122 | * S: 0 <- last pkt was non-CE | ||
123 | * 1 <- last pkt was CE | ||
124 | */ | ||
125 | |||
126 | static void dctcp_ce_state_0_to_1(struct sock *sk) | ||
127 | { | ||
128 | struct dctcp *ca = inet_csk_ca(sk); | ||
129 | struct tcp_sock *tp = tcp_sk(sk); | ||
130 | |||
131 | /* State has changed from CE=0 to CE=1 and delayed | ||
132 | * ACK has not sent yet. | ||
133 | */ | ||
134 | if (!ca->ce_state && ca->delayed_ack_reserved) { | ||
135 | u32 tmp_rcv_nxt; | ||
136 | |||
137 | /* Save current rcv_nxt. */ | ||
138 | tmp_rcv_nxt = tp->rcv_nxt; | ||
139 | |||
140 | /* Generate previous ack with CE=0. */ | ||
141 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | ||
142 | tp->rcv_nxt = ca->prior_rcv_nxt; | ||
143 | |||
144 | tcp_send_ack(sk); | ||
145 | |||
146 | /* Recover current rcv_nxt. */ | ||
147 | tp->rcv_nxt = tmp_rcv_nxt; | ||
148 | } | ||
149 | |||
150 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
151 | ca->ce_state = 1; | ||
152 | |||
153 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
154 | } | ||
155 | |||
156 | static void dctcp_ce_state_1_to_0(struct sock *sk) | ||
157 | { | ||
158 | struct dctcp *ca = inet_csk_ca(sk); | ||
159 | struct tcp_sock *tp = tcp_sk(sk); | ||
160 | |||
161 | /* State has changed from CE=1 to CE=0 and delayed | ||
162 | * ACK has not sent yet. | ||
163 | */ | ||
164 | if (ca->ce_state && ca->delayed_ack_reserved) { | ||
165 | u32 tmp_rcv_nxt; | ||
166 | |||
167 | /* Save current rcv_nxt. */ | ||
168 | tmp_rcv_nxt = tp->rcv_nxt; | ||
169 | |||
170 | /* Generate previous ack with CE=1. */ | ||
171 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
172 | tp->rcv_nxt = ca->prior_rcv_nxt; | ||
173 | |||
174 | tcp_send_ack(sk); | ||
175 | |||
176 | /* Recover current rcv_nxt. */ | ||
177 | tp->rcv_nxt = tmp_rcv_nxt; | ||
178 | } | ||
179 | |||
180 | ca->prior_rcv_nxt = tp->rcv_nxt; | ||
181 | ca->ce_state = 0; | ||
182 | |||
183 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | ||
184 | } | ||
185 | |||
186 | static void dctcp_update_alpha(struct sock *sk, u32 flags) | ||
187 | { | ||
188 | const struct tcp_sock *tp = tcp_sk(sk); | ||
189 | struct dctcp *ca = inet_csk_ca(sk); | ||
190 | u32 acked_bytes = tp->snd_una - ca->prior_snd_una; | ||
191 | |||
192 | /* If ack did not advance snd_una, count dupack as MSS size. | ||
193 | * If ack did update window, do not count it at all. | ||
194 | */ | ||
195 | if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) | ||
196 | acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; | ||
197 | if (acked_bytes) { | ||
198 | ca->acked_bytes_total += acked_bytes; | ||
199 | ca->prior_snd_una = tp->snd_una; | ||
200 | |||
201 | if (flags & CA_ACK_ECE) | ||
202 | ca->acked_bytes_ecn += acked_bytes; | ||
203 | } | ||
204 | |||
205 | /* Expired RTT */ | ||
206 | if (!before(tp->snd_una, ca->next_seq)) { | ||
207 | /* For avoiding denominator == 1. */ | ||
208 | if (ca->acked_bytes_total == 0) | ||
209 | ca->acked_bytes_total = 1; | ||
210 | |||
211 | /* alpha = (1 - g) * alpha + g * F */ | ||
212 | ca->dctcp_alpha = ca->dctcp_alpha - | ||
213 | (ca->dctcp_alpha >> dctcp_shift_g) + | ||
214 | (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / | ||
215 | ca->acked_bytes_total; | ||
216 | |||
217 | if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) | ||
218 | /* Clamp dctcp_alpha to max. */ | ||
219 | ca->dctcp_alpha = DCTCP_MAX_ALPHA; | ||
220 | |||
221 | dctcp_reset(tp, ca); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | static void dctcp_state(struct sock *sk, u8 new_state) | ||
226 | { | ||
227 | if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { | ||
228 | struct dctcp *ca = inet_csk_ca(sk); | ||
229 | |||
230 | /* If this extension is enabled, we clamp dctcp_alpha to | ||
231 | * max on packet loss; the motivation is that dctcp_alpha | ||
232 | * is an indicator to the extend of congestion and packet | ||
233 | * loss is an indicator of extreme congestion; setting | ||
234 | * this in practice turned out to be beneficial, and | ||
235 | * effectively assumes total congestion which reduces the | ||
236 | * window by half. | ||
237 | */ | ||
238 | ca->dctcp_alpha = DCTCP_MAX_ALPHA; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) | ||
243 | { | ||
244 | struct dctcp *ca = inet_csk_ca(sk); | ||
245 | |||
246 | switch (ev) { | ||
247 | case CA_EVENT_DELAYED_ACK: | ||
248 | if (!ca->delayed_ack_reserved) | ||
249 | ca->delayed_ack_reserved = 1; | ||
250 | break; | ||
251 | case CA_EVENT_NON_DELAYED_ACK: | ||
252 | if (ca->delayed_ack_reserved) | ||
253 | ca->delayed_ack_reserved = 0; | ||
254 | break; | ||
255 | default: | ||
256 | /* Don't care for the rest. */ | ||
257 | break; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) | ||
262 | { | ||
263 | switch (ev) { | ||
264 | case CA_EVENT_ECN_IS_CE: | ||
265 | dctcp_ce_state_0_to_1(sk); | ||
266 | break; | ||
267 | case CA_EVENT_ECN_NO_CE: | ||
268 | dctcp_ce_state_1_to_0(sk); | ||
269 | break; | ||
270 | case CA_EVENT_DELAYED_ACK: | ||
271 | case CA_EVENT_NON_DELAYED_ACK: | ||
272 | dctcp_update_ack_reserved(sk, ev); | ||
273 | break; | ||
274 | default: | ||
275 | /* Don't care for the rest. */ | ||
276 | break; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | ||
281 | { | ||
282 | const struct dctcp *ca = inet_csk_ca(sk); | ||
283 | |||
284 | /* Fill it also in case of VEGASINFO due to req struct limits. | ||
285 | * We can still correctly retrieve it later. | ||
286 | */ | ||
287 | if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || | ||
288 | ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | ||
289 | struct tcp_dctcp_info info; | ||
290 | |||
291 | memset(&info, 0, sizeof(info)); | ||
292 | if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { | ||
293 | info.dctcp_enabled = 1; | ||
294 | info.dctcp_ce_state = (u16) ca->ce_state; | ||
295 | info.dctcp_alpha = ca->dctcp_alpha; | ||
296 | info.dctcp_ab_ecn = ca->acked_bytes_ecn; | ||
297 | info.dctcp_ab_tot = ca->acked_bytes_total; | ||
298 | } | ||
299 | |||
300 | nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | static struct tcp_congestion_ops dctcp __read_mostly = { | ||
305 | .init = dctcp_init, | ||
306 | .in_ack_event = dctcp_update_alpha, | ||
307 | .cwnd_event = dctcp_cwnd_event, | ||
308 | .ssthresh = dctcp_ssthresh, | ||
309 | .cong_avoid = tcp_reno_cong_avoid, | ||
310 | .set_state = dctcp_state, | ||
311 | .get_info = dctcp_get_info, | ||
312 | .flags = TCP_CONG_NEEDS_ECN, | ||
313 | .owner = THIS_MODULE, | ||
314 | .name = "dctcp", | ||
315 | }; | ||
316 | |||
317 | static struct tcp_congestion_ops dctcp_reno __read_mostly = { | ||
318 | .ssthresh = tcp_reno_ssthresh, | ||
319 | .cong_avoid = tcp_reno_cong_avoid, | ||
320 | .get_info = dctcp_get_info, | ||
321 | .owner = THIS_MODULE, | ||
322 | .name = "dctcp-reno", | ||
323 | }; | ||
324 | |||
325 | static int __init dctcp_register(void) | ||
326 | { | ||
327 | BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); | ||
328 | return tcp_register_congestion_control(&dctcp); | ||
329 | } | ||
330 | |||
331 | static void __exit dctcp_unregister(void) | ||
332 | { | ||
333 | tcp_unregister_congestion_control(&dctcp); | ||
334 | } | ||
335 | |||
336 | module_init(dctcp_register); | ||
337 | module_exit(dctcp_unregister); | ||
338 | |||
339 | MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); | ||
340 | MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); | ||
341 | MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); | ||
342 | |||
343 | MODULE_LICENSE("GPL v2"); | ||
344 | MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); | ||
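For reference (not part of the patch text), the fixed-point arithmetic in dctcp_update_alpha() and dctcp_ssthresh() above corresponds to the estimator and window reduction from the DCTCP paper, with alpha kept in units of 1/1024 (DCTCP_MAX_ALPHA) and g = 2^-dctcp_shift_g, i.e. 1/16 by default:

\[
F = \frac{\text{acked\_bytes\_ecn}}{\text{acked\_bytes\_total}}, \qquad
\alpha \leftarrow (1 - g)\,\alpha + g\,F
\]

\[
\text{ssthresh} = \text{cwnd}\left(1 - \frac{\alpha}{2}\right)
= \text{cwnd} - \left\lfloor \frac{\text{cwnd} \cdot \alpha_{1024}}{2048} \right\rfloor
\]

where alpha_1024 = 1024 * alpha is the scaled value stored in ca->dctcp_alpha. The >> 11U in dctcp_ssthresh() combines the division by 2 from the paper with the 1/1024 descaling, and the << (10U - dctcp_shift_g) in dctcp_update_alpha() applies the g * 1024 factor to the per-RTT CE fraction.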
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fae..fc133178c787 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s | |||
233 | tcp_enter_quickack_mode((struct sock *)tp); | 233 | tcp_enter_quickack_mode((struct sock *)tp); |
234 | break; | 234 | break; |
235 | case INET_ECN_CE: | 235 | case INET_ECN_CE: |
236 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
237 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); | ||
238 | |||
236 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { | 239 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { |
237 | /* Better not delay acks, sender can have a very low cwnd */ | 240 | /* Better not delay acks, sender can have a very low cwnd */ |
238 | tcp_enter_quickack_mode((struct sock *)tp); | 241 | tcp_enter_quickack_mode((struct sock *)tp); |
239 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | 242 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; |
240 | } | 243 | } |
241 | /* fallinto */ | 244 | tp->ecn_flags |= TCP_ECN_SEEN; |
245 | break; | ||
242 | default: | 246 | default: |
247 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
248 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); | ||
243 | tp->ecn_flags |= TCP_ECN_SEEN; | 249 | tp->ecn_flags |= TCP_ECN_SEEN; |
250 | break; | ||
244 | } | 251 | } |
245 | } | 252 | } |
246 | 253 | ||
@@ -3362,6 +3369,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) | |||
3362 | } | 3369 | } |
3363 | } | 3370 | } |
3364 | 3371 | ||
3372 | static inline void tcp_in_ack_event(struct sock *sk, u32 flags) | ||
3373 | { | ||
3374 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
3375 | |||
3376 | if (icsk->icsk_ca_ops->in_ack_event) | ||
3377 | icsk->icsk_ca_ops->in_ack_event(sk, flags); | ||
3378 | } | ||
3379 | |||
3365 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3380 | /* This routine deals with incoming acks, but not outgoing ones. */ |
3366 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3381 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) |
3367 | { | 3382 | { |
@@ -3421,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3421 | tp->snd_una = ack; | 3436 | tp->snd_una = ack; |
3422 | flag |= FLAG_WIN_UPDATE; | 3437 | flag |= FLAG_WIN_UPDATE; |
3423 | 3438 | ||
3424 | tcp_ca_event(sk, CA_EVENT_FAST_ACK); | 3439 | tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); |
3425 | 3440 | ||
3426 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); | 3441 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); |
3427 | } else { | 3442 | } else { |
3443 | u32 ack_ev_flags = CA_ACK_SLOWPATH; | ||
3444 | |||
3428 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 3445 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
3429 | flag |= FLAG_DATA; | 3446 | flag |= FLAG_DATA; |
3430 | else | 3447 | else |
@@ -3436,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3436 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3453 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3437 | &sack_rtt_us); | 3454 | &sack_rtt_us); |
3438 | 3455 | ||
3439 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3456 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { |
3440 | flag |= FLAG_ECE; | 3457 | flag |= FLAG_ECE; |
3458 | ack_ev_flags |= CA_ACK_ECE; | ||
3459 | } | ||
3460 | |||
3461 | if (flag & FLAG_WIN_UPDATE) | ||
3462 | ack_ev_flags |= CA_ACK_WIN_UPDATE; | ||
3441 | 3463 | ||
3442 | tcp_ca_event(sk, CA_EVENT_SLOW_ACK); | 3464 | tcp_in_ack_event(sk, ack_ev_flags); |
3443 | } | 3465 | } |
3444 | 3466 | ||
3445 | /* We passed data and got it acked, remove any soft error | 3467 | /* We passed data and got it acked, remove any soft error |
@@ -5944,7 +5966,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
5944 | goto drop_and_free; | 5966 | goto drop_and_free; |
5945 | 5967 | ||
5946 | if (!want_cookie || tmp_opt.tstamp_ok) | 5968 | if (!want_cookie || tmp_opt.tstamp_ok) |
5947 | TCP_ECN_create_request(req, skb, sock_net(sk)); | 5969 | TCP_ECN_create_request(req, skb, sk); |
5948 | 5970 | ||
5949 | if (want_cookie) { | 5971 | if (want_cookie) { |
5950 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); | 5972 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a058f411d3a6..47b73506b77e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
451 | newtp->snd_cwnd = TCP_INIT_CWND; | 451 | newtp->snd_cwnd = TCP_INIT_CWND; |
452 | newtp->snd_cwnd_cnt = 0; | 452 | newtp->snd_cwnd_cnt = 0; |
453 | 453 | ||
454 | if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && | 454 | if (!try_module_get(newicsk->icsk_ca_ops->owner)) |
455 | !try_module_get(newicsk->icsk_ca_ops->owner)) | 455 | tcp_assign_congestion_control(newsk); |
456 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | ||
457 | 456 | ||
458 | tcp_set_ca_state(newsk, TCP_CA_Open); | 457 | tcp_set_ca_state(newsk, TCP_CA_Open); |
459 | tcp_init_xmit_timers(newsk); | 458 | tcp_init_xmit_timers(newsk); |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c6..86a0216fcaa1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk) | |||
318 | } | 318 | } |
319 | 319 | ||
320 | /* Packet ECN state for a SYN-ACK */ | 320 | /* Packet ECN state for a SYN-ACK */ |
321 | static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) | 321 | static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) |
322 | { | 322 | { |
323 | const struct tcp_sock *tp = tcp_sk(sk); | ||
324 | |||
323 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; | 325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; |
324 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 326 | if (!(tp->ecn_flags & TCP_ECN_OK)) |
325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; | 327 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; |
328 | else if (tcp_ca_needs_ecn(sk)) | ||
329 | INET_ECN_xmit(sk); | ||
326 | } | 330 | } |
327 | 331 | ||
328 | /* Packet ECN state for a SYN. */ | 332 | /* Packet ECN state for a SYN. */ |
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) | |||
331 | struct tcp_sock *tp = tcp_sk(sk); | 335 | struct tcp_sock *tp = tcp_sk(sk); |
332 | 336 | ||
333 | tp->ecn_flags = 0; | 337 | tp->ecn_flags = 0; |
334 | if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { | 338 | if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || |
339 | tcp_ca_needs_ecn(sk)) { | ||
335 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; | 340 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; |
336 | tp->ecn_flags = TCP_ECN_OK; | 341 | tp->ecn_flags = TCP_ECN_OK; |
342 | if (tcp_ca_needs_ecn(sk)) | ||
343 | INET_ECN_xmit(sk); | ||
337 | } | 344 | } |
338 | } | 345 | } |
339 | 346 | ||
340 | static __inline__ void | 347 | static __inline__ void |
341 | TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) | 348 | TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, |
349 | struct sock *sk) | ||
342 | { | 350 | { |
343 | if (inet_rsk(req)->ecn_ok) | 351 | if (inet_rsk(req)->ecn_ok) { |
344 | th->ece = 1; | 352 | th->ece = 1; |
353 | if (tcp_ca_needs_ecn(sk)) | ||
354 | INET_ECN_xmit(sk); | ||
355 | } | ||
345 | } | 356 | } |
346 | 357 | ||
347 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to | 358 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to |
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, | |||
362 | tcp_hdr(skb)->cwr = 1; | 373 | tcp_hdr(skb)->cwr = 1; |
363 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; | 374 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; |
364 | } | 375 | } |
365 | } else { | 376 | } else if (!tcp_ca_needs_ecn(sk)) { |
366 | /* ACK or retransmitted segment: clear ECT|CE */ | 377 | /* ACK or retransmitted segment: clear ECT|CE */ |
367 | INET_ECN_dontxmit(sk); | 378 | INET_ECN_dontxmit(sk); |
368 | } | 379 | } |
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) | |||
2789 | } | 2800 | } |
2790 | 2801 | ||
2791 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; | 2802 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; |
2792 | TCP_ECN_send_synack(tcp_sk(sk), skb); | 2803 | TCP_ECN_send_synack(sk, skb); |
2793 | } | 2804 | } |
2794 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2805 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2795 | } | 2806 | } |
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2848 | memset(th, 0, sizeof(struct tcphdr)); | 2859 | memset(th, 0, sizeof(struct tcphdr)); |
2849 | th->syn = 1; | 2860 | th->syn = 1; |
2850 | th->ack = 1; | 2861 | th->ack = 1; |
2851 | TCP_ECN_make_synack(req, th); | 2862 | TCP_ECN_make_synack(req, th, sk); |
2852 | th->source = htons(ireq->ir_num); | 2863 | th->source = htons(ireq->ir_num); |
2853 | th->dest = ireq->ir_rmt_port; | 2864 | th->dest = ireq->ir_rmt_port; |
2854 | /* Setting of flags are superfluous here for callers (and ECE is | 2865 | /* Setting of flags are superfluous here for callers (and ECE is |
@@ -3119,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk) | |||
3119 | int ato = icsk->icsk_ack.ato; | 3130 | int ato = icsk->icsk_ack.ato; |
3120 | unsigned long timeout; | 3131 | unsigned long timeout; |
3121 | 3132 | ||
3133 | tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); | ||
3134 | |||
3122 | if (ato > TCP_DELACK_MIN) { | 3135 | if (ato > TCP_DELACK_MIN) { |
3123 | const struct tcp_sock *tp = tcp_sk(sk); | 3136 | const struct tcp_sock *tp = tcp_sk(sk); |
3124 | int max_ato = HZ / 2; | 3137 | int max_ato = HZ / 2; |
@@ -3175,6 +3188,8 @@ void tcp_send_ack(struct sock *sk) | |||
3175 | if (sk->sk_state == TCP_CLOSE) | 3188 | if (sk->sk_state == TCP_CLOSE) |
3176 | return; | 3189 | return; |
3177 | 3190 | ||
3191 | tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); | ||
3192 | |||
3178 | /* We are not putting this on the write queue, so | 3193 | /* We are not putting this on the write queue, so |
3179 | * tcp_transmit_skb() will set the ownership to this | 3194 | * tcp_transmit_skb() will set the ownership to this |
3180 | * sock. | 3195 | * sock. |
@@ -3196,6 +3211,7 @@ void tcp_send_ack(struct sock *sk) | |||
3196 | skb_mstamp_get(&buff->skb_mstamp); | 3211 | skb_mstamp_get(&buff->skb_mstamp); |
3197 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); | 3212 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); |
3198 | } | 3213 | } |
3214 | EXPORT_SYMBOL_GPL(tcp_send_ack); | ||
3199 | 3215 | ||
3200 | /* This routine sends a packet with an out of date sequence | 3216 | /* This routine sends a packet with an out of date sequence |
3201 | * number. It assumes the other end will try to ack it. | 3217 | * number. It assumes the other end will try to ack it. |
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 81911a92356c..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) | |||
220 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | 220 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); |
221 | } | 221 | } |
222 | 222 | ||
223 | static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) | ||
224 | { | ||
225 | if (ack_flags & CA_ACK_SLOWPATH) { | ||
226 | struct westwood *w = inet_csk_ca(sk); | ||
227 | |||
228 | westwood_update_window(sk); | ||
229 | w->bk += westwood_acked_count(sk); | ||
230 | |||
231 | update_rtt_min(w); | ||
232 | return; | ||
233 | } | ||
234 | |||
235 | westwood_fast_bw(sk); | ||
236 | } | ||
237 | |||
223 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | 238 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
224 | { | 239 | { |
225 | struct tcp_sock *tp = tcp_sk(sk); | 240 | struct tcp_sock *tp = tcp_sk(sk); |
226 | struct westwood *w = inet_csk_ca(sk); | 241 | struct westwood *w = inet_csk_ca(sk); |
227 | 242 | ||
228 | switch (event) { | 243 | switch (event) { |
229 | case CA_EVENT_FAST_ACK: | ||
230 | westwood_fast_bw(sk); | ||
231 | break; | ||
232 | |||
233 | case CA_EVENT_COMPLETE_CWR: | 244 | case CA_EVENT_COMPLETE_CWR: |
234 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 245 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
235 | break; | 246 | break; |
236 | |||
237 | case CA_EVENT_LOSS: | 247 | case CA_EVENT_LOSS: |
238 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); | 248 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
239 | /* Update RTT_min when next ack arrives */ | 249 | /* Update RTT_min when next ack arrives */ |
240 | w->reset_rtt_min = 1; | 250 | w->reset_rtt_min = 1; |
241 | break; | 251 | break; |
242 | |||
243 | case CA_EVENT_SLOW_ACK: | ||
244 | westwood_update_window(sk); | ||
245 | w->bk += westwood_acked_count(sk); | ||
246 | update_rtt_min(w); | ||
247 | break; | ||
248 | |||
249 | default: | 252 | default: |
250 | /* don't care */ | 253 | /* don't care */ |
251 | break; | 254 | break; |
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { | |||
274 | .ssthresh = tcp_reno_ssthresh, | 277 | .ssthresh = tcp_reno_ssthresh, |
275 | .cong_avoid = tcp_reno_cong_avoid, | 278 | .cong_avoid = tcp_reno_cong_avoid, |
276 | .cwnd_event = tcp_westwood_event, | 279 | .cwnd_event = tcp_westwood_event, |
280 | .in_ack_event = tcp_westwood_ack, | ||
277 | .get_info = tcp_westwood_info, | 281 | .get_info = tcp_westwood_info, |
278 | .pkts_acked = tcp_westwood_pkts_acked, | 282 | .pkts_acked = tcp_westwood_pkts_acked, |
279 | 283 | ||