about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Eric Dumazet <edumazet@google.com> 2014-10-28 00:45:24 -0400
committer David S. Miller <davem@davemloft.net> 2014-10-29 15:05:15 -0400
commit dca145ffaa8d39ea1904491ac81b92b7049372c0 (patch)
tree ebe67ece694b9e54281047a6037b7d5776f9f56b
parent 7aef06db0f91c7b48305d07b62edf43179adb28c (diff)
tcp: allow for bigger reordering level
While testing Yaogong's upcoming patch (converting the out-of-order queue into an RB tree), I hit the max reordering level of the Linux TCP stack.

Reordering level was limited to 127 for no good reason, and some network setups [1] can easily reach this limit and get limited throughput.

Allow a new max limit of 300, and add a sysctl so that admins can set even bigger (or lower) values if needed.

[1] Aggregation of links, per-packet load balancing, fabrics not doing deep packet inspection, alternative TCP congestion modules...

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yaogong Wang <wygivan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- Documentation/networking/bonding.txt 7
-rw-r--r-- Documentation/networking/ip-sysctl.txt 10
-rw-r--r-- include/linux/tcp.h 4
-rw-r--r-- include/net/tcp.h 4
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c 7
-rw-r--r-- net/ipv4/tcp_input.c 3
6 files changed, 23 insertions, 12 deletions
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index eeb5b2e97bed..83bf4986baea 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single
 
 	It is possible to adjust TCP/IP's congestion limits by
 	altering the net.ipv4.tcp_reordering sysctl parameter. The
-	usual default value is 3, and the maximum useful value is 127.
-	For a four interface balance-rr bond, expect that a single
-	TCP/IP stream will utilize no more than approximately 2.3
-	interface's worth of throughput, even after adjusting
-	tcp_reordering.
+	usual default value is 3. But keep in mind TCP stack is able
+	to automatically increase this when it detects reorders.
 
 	Note that the fraction of packets that will be delivered out of
 	order is highly variable, and is unlikely to be zero. The level
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0307e2875f21..9028b879a97b 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -376,9 +376,17 @@ tcp_orphan_retries - INTEGER
 	may consume significant resources. Cf. tcp_max_orphans.
 
 tcp_reordering - INTEGER
-	Maximal reordering of packets in a TCP stream.
+	Initial reordering level of packets in a TCP stream.
+	TCP stack can then dynamically adjust flow reordering level
+	between this initial value and tcp_max_reordering
 	Default: 3
 
+tcp_max_reordering - INTEGER
+	Maximal reordering level of packets in a TCP stream.
+	300 is a fairly conservative value, but you might increase it
+	if paths are using per packet load balancing (like bonding rr mode)
+	Default: 300
+
 tcp_retrans_collapse - BOOLEAN
 	Bug-to-bug compatibility with some broken printers.
 	On retransmit try to send bigger packets to work around bugs in
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c2dee7deefa8..f566b8567892 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -204,10 +204,10 @@ struct tcp_sock {
204 204
205 u16 urg_data; /* Saved octet of OOB data and control flags */ 205 u16 urg_data; /* Saved octet of OOB data and control flags */
206 u8 ecn_flags; /* ECN status bits. */ 206 u8 ecn_flags; /* ECN status bits. */
207 u8 reordering; /* Packet reordering metric. */ 207 u8 keepalive_probes; /* num of allowed keep alive probes */
208 u32 reordering; /* Packet reordering metric. */
208 u32 snd_up; /* Urgent pointer */ 209 u32 snd_up; /* Urgent pointer */
209 210
210 u8 keepalive_probes; /* num of allowed keep alive probes */
211/* 211/*
212 * Options received (usually on last packet, some only on SYN packets). 212 * Options received (usually on last packet, some only on SYN packets).
213 */ 213 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c73fc145ee45..3a35b1500359 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -70,9 +70,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
70/* After receiving this amount of duplicate ACKs fast retransmit starts. */ 70/* After receiving this amount of duplicate ACKs fast retransmit starts. */
71#define TCP_FASTRETRANS_THRESH 3 71#define TCP_FASTRETRANS_THRESH 3
72 72
73/* Maximal reordering. */
74#define TCP_MAX_REORDERING 127
75
76/* Maximal number of ACKs sent quickly to accelerate slow-start. */ 73/* Maximal number of ACKs sent quickly to accelerate slow-start. */
77#define TCP_MAX_QUICKACKS 16U 74#define TCP_MAX_QUICKACKS 16U
78 75
@@ -252,6 +249,7 @@ extern int sysctl_tcp_abort_on_overflow;
252extern int sysctl_tcp_max_orphans; 249extern int sysctl_tcp_max_orphans;
253extern int sysctl_tcp_fack; 250extern int sysctl_tcp_fack;
254extern int sysctl_tcp_reordering; 251extern int sysctl_tcp_reordering;
252extern int sysctl_tcp_max_reordering;
255extern int sysctl_tcp_dsack; 253extern int sysctl_tcp_dsack;
256extern long sysctl_tcp_mem[3]; 254extern long sysctl_tcp_mem[3];
257extern int sysctl_tcp_wmem[3]; 255extern int sysctl_tcp_wmem[3];
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b3c53c8b331e..e0ee384a448f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
496 .proc_handler = proc_dointvec 496 .proc_handler = proc_dointvec
497 }, 497 },
498 { 498 {
499 .procname = "tcp_max_reordering",
500 .data = &sysctl_tcp_max_reordering,
501 .maxlen = sizeof(int),
502 .mode = 0644,
503 .proc_handler = proc_dointvec
504 },
505 {
499 .procname = "tcp_dsack", 506 .procname = "tcp_dsack",
500 .data = &sysctl_tcp_dsack, 507 .data = &sysctl_tcp_dsack,
501 .maxlen = sizeof(int), 508 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a12b455928e5..9a18cdd633f3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
81int sysctl_tcp_sack __read_mostly = 1; 81int sysctl_tcp_sack __read_mostly = 1;
82int sysctl_tcp_fack __read_mostly = 1; 82int sysctl_tcp_fack __read_mostly = 1;
83int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 83int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
84int sysctl_tcp_max_reordering __read_mostly = 300;
84EXPORT_SYMBOL(sysctl_tcp_reordering); 85EXPORT_SYMBOL(sysctl_tcp_reordering);
85int sysctl_tcp_dsack __read_mostly = 1; 86int sysctl_tcp_dsack __read_mostly = 1;
86int sysctl_tcp_app_win __read_mostly = 31; 87int sysctl_tcp_app_win __read_mostly = 31;
@@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
833 if (metric > tp->reordering) { 834 if (metric > tp->reordering) {
834 int mib_idx; 835 int mib_idx;
835 836
836 tp->reordering = min(TCP_MAX_REORDERING, metric); 837 tp->reordering = min(sysctl_tcp_max_reordering, metric);
837 838
838 /* This exciting event is worth to be remembered. 8) */ 839 /* This exciting event is worth to be remembered. 8) */
839 if (ts) 840 if (ts)