diff options
author | Eric Dumazet <edumazet@google.com> | 2014-10-28 00:45:24 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-10-29 15:05:15 -0400 |
commit | dca145ffaa8d39ea1904491ac81b92b7049372c0 (patch) | |
tree | ebe67ece694b9e54281047a6037b7d5776f9f56b | |
parent | 7aef06db0f91c7b48305d07b62edf43179adb28c (diff) |
tcp: allow for bigger reordering level
While testing Yaogong's upcoming patch (converting the out-of-order queue
into an RB tree), I hit the max reordering level of the Linux TCP stack.
The reordering level was limited to 127 for no good reason, and some
network setups [1] can easily reach this limit and suffer limited
throughput.
Raise the default max limit to 300, and add a sysctl (tcp_max_reordering)
so that admins can set even bigger (or lower) values if needed.
[1] Aggregation of links, per-packet load balancing, fabrics not doing
deep packet inspection, alternative TCP congestion control modules...
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yaogong Wang <wygivan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/bonding.txt | 7 | ||||
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 10 | ||||
-rw-r--r-- | include/linux/tcp.h | 4 | ||||
-rw-r--r-- | include/net/tcp.h | 4 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 3 |
6 files changed, 23 insertions, 12 deletions
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index eeb5b2e97bed..83bf4986baea 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt | |||
@@ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single | |||
2230 | 2230 | ||
2231 | It is possible to adjust TCP/IP's congestion limits by | 2231 | It is possible to adjust TCP/IP's congestion limits by |
2232 | altering the net.ipv4.tcp_reordering sysctl parameter. The | 2232 | altering the net.ipv4.tcp_reordering sysctl parameter. The |
2233 | usual default value is 3, and the maximum useful value is 127. | 2233 | usual default value is 3. But keep in mind TCP stack is able |
2234 | For a four interface balance-rr bond, expect that a single | 2234 | to automatically increase this when it detects reorders. |
2235 | TCP/IP stream will utilize no more than approximately 2.3 | ||
2236 | interface's worth of throughput, even after adjusting | ||
2237 | tcp_reordering. | ||
2238 | 2235 | ||
2239 | Note that the fraction of packets that will be delivered out of | 2236 | Note that the fraction of packets that will be delivered out of |
2240 | order is highly variable, and is unlikely to be zero. The level | 2237 | order is highly variable, and is unlikely to be zero. The level |
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 0307e2875f21..9028b879a97b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -376,9 +376,17 @@ tcp_orphan_retries - INTEGER | |||
376 | may consume significant resources. Cf. tcp_max_orphans. | 376 | may consume significant resources. Cf. tcp_max_orphans. |
377 | 377 | ||
378 | tcp_reordering - INTEGER | 378 | tcp_reordering - INTEGER |
379 | Maximal reordering of packets in a TCP stream. | 379 | Initial reordering level of packets in a TCP stream. |
380 | TCP stack can then dynamically adjust flow reordering level | ||
381 | between this initial value and tcp_max_reordering | ||
380 | Default: 3 | 382 | Default: 3 |
381 | 383 | ||
384 | tcp_max_reordering - INTEGER | ||
385 | Maximal reordering level of packets in a TCP stream. | ||
386 | 300 is a fairly conservative value, but you might increase it | ||
387 | if paths are using per packet load balancing (like bonding rr mode) | ||
388 | Default: 300 | ||
389 | |||
382 | tcp_retrans_collapse - BOOLEAN | 390 | tcp_retrans_collapse - BOOLEAN |
383 | Bug-to-bug compatibility with some broken printers. | 391 | Bug-to-bug compatibility with some broken printers. |
384 | On retransmit try to send bigger packets to work around bugs in | 392 | On retransmit try to send bigger packets to work around bugs in |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c2dee7deefa8..f566b8567892 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -204,10 +204,10 @@ struct tcp_sock { | |||
204 | 204 | ||
205 | u16 urg_data; /* Saved octet of OOB data and control flags */ | 205 | u16 urg_data; /* Saved octet of OOB data and control flags */ |
206 | u8 ecn_flags; /* ECN status bits. */ | 206 | u8 ecn_flags; /* ECN status bits. */ |
207 | u8 reordering; /* Packet reordering metric. */ | 207 | u8 keepalive_probes; /* num of allowed keep alive probes */ |
208 | u32 reordering; /* Packet reordering metric. */ | ||
208 | u32 snd_up; /* Urgent pointer */ | 209 | u32 snd_up; /* Urgent pointer */ |
209 | 210 | ||
210 | u8 keepalive_probes; /* num of allowed keep alive probes */ | ||
211 | /* | 211 | /* |
212 | * Options received (usually on last packet, some only on SYN packets). | 212 | * Options received (usually on last packet, some only on SYN packets). |
213 | */ | 213 | */ |
diff --git a/include/net/tcp.h b/include/net/tcp.h index c73fc145ee45..3a35b1500359 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -70,9 +70,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); | |||
70 | /* After receiving this amount of duplicate ACKs fast retransmit starts. */ | 70 | /* After receiving this amount of duplicate ACKs fast retransmit starts. */ |
71 | #define TCP_FASTRETRANS_THRESH 3 | 71 | #define TCP_FASTRETRANS_THRESH 3 |
72 | 72 | ||
73 | /* Maximal reordering. */ | ||
74 | #define TCP_MAX_REORDERING 127 | ||
75 | |||
76 | /* Maximal number of ACKs sent quickly to accelerate slow-start. */ | 73 | /* Maximal number of ACKs sent quickly to accelerate slow-start. */ |
77 | #define TCP_MAX_QUICKACKS 16U | 74 | #define TCP_MAX_QUICKACKS 16U |
78 | 75 | ||
@@ -252,6 +249,7 @@ extern int sysctl_tcp_abort_on_overflow; | |||
252 | extern int sysctl_tcp_max_orphans; | 249 | extern int sysctl_tcp_max_orphans; |
253 | extern int sysctl_tcp_fack; | 250 | extern int sysctl_tcp_fack; |
254 | extern int sysctl_tcp_reordering; | 251 | extern int sysctl_tcp_reordering; |
252 | extern int sysctl_tcp_max_reordering; | ||
255 | extern int sysctl_tcp_dsack; | 253 | extern int sysctl_tcp_dsack; |
256 | extern long sysctl_tcp_mem[3]; | 254 | extern long sysctl_tcp_mem[3]; |
257 | extern int sysctl_tcp_wmem[3]; | 255 | extern int sysctl_tcp_wmem[3]; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b3c53c8b331e..e0ee384a448f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { | |||
496 | .proc_handler = proc_dointvec | 496 | .proc_handler = proc_dointvec |
497 | }, | 497 | }, |
498 | { | 498 | { |
499 | .procname = "tcp_max_reordering", | ||
500 | .data = &sysctl_tcp_max_reordering, | ||
501 | .maxlen = sizeof(int), | ||
502 | .mode = 0644, | ||
503 | .proc_handler = proc_dointvec | ||
504 | }, | ||
505 | { | ||
499 | .procname = "tcp_dsack", | 506 | .procname = "tcp_dsack", |
500 | .data = &sysctl_tcp_dsack, | 507 | .data = &sysctl_tcp_dsack, |
501 | .maxlen = sizeof(int), | 508 | .maxlen = sizeof(int), |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a12b455928e5..9a18cdd633f3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; | |||
81 | int sysctl_tcp_sack __read_mostly = 1; | 81 | int sysctl_tcp_sack __read_mostly = 1; |
82 | int sysctl_tcp_fack __read_mostly = 1; | 82 | int sysctl_tcp_fack __read_mostly = 1; |
83 | int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; | 83 | int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; |
84 | int sysctl_tcp_max_reordering __read_mostly = 300; | ||
84 | EXPORT_SYMBOL(sysctl_tcp_reordering); | 85 | EXPORT_SYMBOL(sysctl_tcp_reordering); |
85 | int sysctl_tcp_dsack __read_mostly = 1; | 86 | int sysctl_tcp_dsack __read_mostly = 1; |
86 | int sysctl_tcp_app_win __read_mostly = 31; | 87 | int sysctl_tcp_app_win __read_mostly = 31; |
@@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | |||
833 | if (metric > tp->reordering) { | 834 | if (metric > tp->reordering) { |
834 | int mib_idx; | 835 | int mib_idx; |
835 | 836 | ||
836 | tp->reordering = min(TCP_MAX_REORDERING, metric); | 837 | tp->reordering = min(sysctl_tcp_max_reordering, metric); |
837 | 838 | ||
838 | /* This exciting event is worth to be remembered. 8) */ | 839 | /* This exciting event is worth to be remembered. 8) */ |
839 | if (ts) | 840 | if (ts) |