diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 13 | ||||
-rw-r--r-- | include/linux/tcp.h | 1 | ||||
-rw-r--r-- | include/net/sock.h | 19 | ||||
-rw-r--r-- | include/net/tcp.h | 14 | ||||
-rw-r--r-- | include/uapi/linux/tcp.h | 1 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 3 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |
10 files changed, 61 insertions, 6 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 10742902146f..53cea9bcb14c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max | |||
516 | this value is ignored. | 516 | this value is ignored. |
517 | Default: between 64K and 4MB, depending on RAM size. | 517 | Default: between 64K and 4MB, depending on RAM size. |
518 | 518 | ||
519 | tcp_notsent_lowat - UNSIGNED INTEGER | ||
520 | A TCP socket can control the amount of unsent bytes in its write queue, | ||
521 | thanks to the TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() | ||
522 | report POLLOUT events if the amount of unsent bytes is below a | ||
523 | per-socket value, and if the write queue is not full. sendmsg() will | ||
524 | also not add new buffers if the limit is hit. | ||
525 | |||
526 | This global variable controls the amount of unsent data for | ||
527 | sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change | ||
528 | to the global variable has immediate effect. | ||
529 | |||
530 | Default: UINT_MAX (0xFFFFFFFF) | ||
531 | |||
519 | tcp_workaround_signed_windows - BOOLEAN | 532 | tcp_workaround_signed_windows - BOOLEAN |
520 | If set, assume no receipt of a window scaling option means the | 533 | If set, assume no receipt of a window scaling option means the |
521 | remote TCP is broken and treats the window as a signed quantity. | 534 | remote TCP is broken and treats the window as a signed quantity. |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 472120b4fac5..9640803a17a7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -238,6 +238,7 @@ struct tcp_sock { | |||
238 | 238 | ||
239 | u32 rcv_wnd; /* Current receiver window */ | 239 | u32 rcv_wnd; /* Current receiver window */ |
240 | u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ | 240 | u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ |
241 | u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ | ||
241 | u32 pushed_seq; /* Last pushed seq, required to talk to windows */ | 242 | u32 pushed_seq; /* Last pushed seq, required to talk to windows */ |
242 | u32 lost_out; /* Lost packets */ | 243 | u32 lost_out; /* Lost packets */ |
243 | u32 sacked_out; /* SACK'd packets */ | 244 | u32 sacked_out; /* SACK'd packets */ |
diff --git a/include/net/sock.h b/include/net/sock.h index d0b5fdee50a2..b9f2b095b1ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk) | |||
746 | 746 | ||
747 | extern void sk_stream_write_space(struct sock *sk); | 747 | extern void sk_stream_write_space(struct sock *sk); |
748 | 748 | ||
749 | static inline bool sk_stream_memory_free(const struct sock *sk) | ||
750 | { | ||
751 | return sk->sk_wmem_queued < sk->sk_sndbuf; | ||
752 | } | ||
753 | |||
754 | /* OOB backlog add */ | 749 | /* OOB backlog add */ |
755 | static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) | 750 | static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) |
756 | { | 751 | { |
@@ -950,6 +945,7 @@ struct proto { | |||
950 | unsigned int inuse_idx; | 945 | unsigned int inuse_idx; |
951 | #endif | 946 | #endif |
952 | 947 | ||
948 | bool (*stream_memory_free)(const struct sock *sk); | ||
953 | /* Memory pressure */ | 949 | /* Memory pressure */ |
954 | void (*enter_memory_pressure)(struct sock *sk); | 950 | void (*enter_memory_pressure)(struct sock *sk); |
955 | atomic_long_t *memory_allocated; /* Current allocated memory. */ | 951 | atomic_long_t *memory_allocated; /* Current allocated memory. */ |
@@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto, | |||
1088 | } | 1084 | } |
1089 | #endif | 1085 | #endif |
1090 | 1086 | ||
1087 | static inline bool sk_stream_memory_free(const struct sock *sk) | ||
1088 | { | ||
1089 | if (sk->sk_wmem_queued >= sk->sk_sndbuf) | ||
1090 | return false; | ||
1091 | |||
1092 | return sk->sk_prot->stream_memory_free ? | ||
1093 | sk->sk_prot->stream_memory_free(sk) : true; | ||
1094 | } | ||
1095 | |||
1091 | static inline bool sk_stream_is_writeable(const struct sock *sk) | 1096 | static inline bool sk_stream_is_writeable(const struct sock *sk) |
1092 | { | 1097 | { |
1093 | return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk); | 1098 | return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && |
1099 | sk_stream_memory_free(sk); | ||
1094 | } | 1100 | } |
1095 | 1101 | ||
1102 | |||
1096 | static inline bool sk_has_memory_pressure(const struct sock *sk) | 1103 | static inline bool sk_has_memory_pressure(const struct sock *sk) |
1097 | { | 1104 | { |
1098 | return sk->sk_prot->memory_pressure != NULL; | 1105 | return sk->sk_prot->memory_pressure != NULL; |
diff --git a/include/net/tcp.h b/include/net/tcp.h index c5868471abae..18fc999dae3c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack; | |||
284 | extern int sysctl_tcp_early_retrans; | 284 | extern int sysctl_tcp_early_retrans; |
285 | extern int sysctl_tcp_limit_output_bytes; | 285 | extern int sysctl_tcp_limit_output_bytes; |
286 | extern int sysctl_tcp_challenge_ack_limit; | 286 | extern int sysctl_tcp_challenge_ack_limit; |
287 | extern unsigned int sysctl_tcp_notsent_lowat; | ||
287 | 288 | ||
288 | extern atomic_long_t tcp_memory_allocated; | 289 | extern atomic_long_t tcp_memory_allocated; |
289 | extern struct percpu_counter tcp_sockets_allocated; | 290 | extern struct percpu_counter tcp_sockets_allocated; |
@@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb); | |||
1539 | extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, | 1540 | extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, |
1540 | __be32 daddr); | 1541 | __be32 daddr); |
1541 | 1542 | ||
1543 | static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) | ||
1544 | { | ||
1545 | return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat; | ||
1546 | } | ||
1547 | |||
1548 | static inline bool tcp_stream_memory_free(const struct sock *sk) | ||
1549 | { | ||
1550 | const struct tcp_sock *tp = tcp_sk(sk); | ||
1551 | u32 notsent_bytes = tp->write_seq - tp->snd_nxt; | ||
1552 | |||
1553 | return notsent_bytes < tcp_notsent_lowat(tp); | ||
1554 | } | ||
1555 | |||
1542 | #ifdef CONFIG_PROC_FS | 1556 | #ifdef CONFIG_PROC_FS |
1543 | extern int tcp4_proc_init(void); | 1557 | extern int tcp4_proc_init(void); |
1544 | extern void tcp4_proc_exit(void); | 1558 | extern void tcp4_proc_exit(void); |
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8d776ebc4829..377f1e59411d 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h | |||
@@ -111,6 +111,7 @@ enum { | |||
111 | #define TCP_REPAIR_OPTIONS 22 | 111 | #define TCP_REPAIR_OPTIONS 22 |
112 | #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ | 112 | #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ |
113 | #define TCP_TIMESTAMP 24 | 113 | #define TCP_TIMESTAMP 24 |
114 | #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ | ||
114 | 115 | ||
115 | struct tcp_repair_opt { | 116 | struct tcp_repair_opt { |
116 | __u32 opt_code; | 117 | __u32 opt_code; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b2c123c44d69..69ed203802da 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -555,6 +555,13 @@ static struct ctl_table ipv4_table[] = { | |||
555 | .extra1 = &one, | 555 | .extra1 = &one, |
556 | }, | 556 | }, |
557 | { | 557 | { |
558 | .procname = "tcp_notsent_lowat", | ||
559 | .data = &sysctl_tcp_notsent_lowat, | ||
560 | .maxlen = sizeof(sysctl_tcp_notsent_lowat), | ||
561 | .mode = 0644, | ||
562 | .proc_handler = proc_dointvec, | ||
563 | }, | ||
564 | { | ||
558 | .procname = "tcp_rmem", | 565 | .procname = "tcp_rmem", |
559 | .data = &sysctl_tcp_rmem, | 566 | .data = &sysctl_tcp_rmem, |
560 | .maxlen = sizeof(sysctl_tcp_rmem), | 567 | .maxlen = sizeof(sysctl_tcp_rmem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5eca9060bb8e..c27e81392398 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2631 | else | 2631 | else |
2632 | tp->tsoffset = val - tcp_time_stamp; | 2632 | tp->tsoffset = val - tcp_time_stamp; |
2633 | break; | 2633 | break; |
2634 | case TCP_NOTSENT_LOWAT: | ||
2635 | tp->notsent_lowat = val; | ||
2636 | sk->sk_write_space(sk); | ||
2637 | break; | ||
2634 | default: | 2638 | default: |
2635 | err = -ENOPROTOOPT; | 2639 | err = -ENOPROTOOPT; |
2636 | break; | 2640 | break; |
@@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
2847 | case TCP_TIMESTAMP: | 2851 | case TCP_TIMESTAMP: |
2848 | val = tcp_time_stamp + tp->tsoffset; | 2852 | val = tcp_time_stamp + tp->tsoffset; |
2849 | break; | 2853 | break; |
2854 | case TCP_NOTSENT_LOWAT: | ||
2855 | val = tp->notsent_lowat; | ||
2856 | break; | ||
2850 | default: | 2857 | default: |
2851 | return -ENOPROTOOPT; | 2858 | return -ENOPROTOOPT; |
2852 | } | 2859 | } |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2e3f129df0eb..2a5d5c469d17 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -2800,6 +2800,7 @@ struct proto tcp_prot = { | |||
2800 | .unhash = inet_unhash, | 2800 | .unhash = inet_unhash, |
2801 | .get_port = inet_csk_get_port, | 2801 | .get_port = inet_csk_get_port, |
2802 | .enter_memory_pressure = tcp_enter_memory_pressure, | 2802 | .enter_memory_pressure = tcp_enter_memory_pressure, |
2803 | .stream_memory_free = tcp_stream_memory_free, | ||
2803 | .sockets_allocated = &tcp_sockets_allocated, | 2804 | .sockets_allocated = &tcp_sockets_allocated, |
2804 | .orphan_count = &tcp_orphan_count, | 2805 | .orphan_count = &tcp_orphan_count, |
2805 | .memory_allocated = &tcp_memory_allocated, | 2806 | .memory_allocated = &tcp_memory_allocated, |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa82..884efff5b531 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; | |||
65 | /* By default, RFC2861 behavior. */ | 65 | /* By default, RFC2861 behavior. */ |
66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
67 | 67 | ||
68 | unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; | ||
69 | EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); | ||
70 | |||
68 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | 71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
69 | int push_one, gfp_t gfp); | 72 | int push_one, gfp_t gfp); |
70 | 73 | ||
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 80fe69ef2188..b792e870686b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = { | |||
1924 | .unhash = inet_unhash, | 1924 | .unhash = inet_unhash, |
1925 | .get_port = inet_csk_get_port, | 1925 | .get_port = inet_csk_get_port, |
1926 | .enter_memory_pressure = tcp_enter_memory_pressure, | 1926 | .enter_memory_pressure = tcp_enter_memory_pressure, |
1927 | .stream_memory_free = tcp_stream_memory_free, | ||
1927 | .sockets_allocated = &tcp_sockets_allocated, | 1928 | .sockets_allocated = &tcp_sockets_allocated, |
1928 | .memory_allocated = &tcp_memory_allocated, | 1929 | .memory_allocated = &tcp_memory_allocated, |
1929 | .memory_pressure = &tcp_memory_pressure, | 1930 | .memory_pressure = &tcp_memory_pressure, |