path: root/net/ipv4/inet_connection_sock.c
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r--  net/ipv4/inet_connection_sock.c | 268
1 files changed, 131 insertions, 137 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 46b9c887bede..bc5196ea1bdf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 #include <net/tcp.h>
+#include <net/sock_reuseport.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
                        if ((!reuse || !sk2->sk_reuse ||
                             sk2->sk_state == TCP_LISTEN) &&
                            (!reuseport || !sk2->sk_reuseport ||
-                            (sk2->sk_state != TCP_TIME_WAIT &&
+                            rcu_access_pointer(sk->sk_reuseport_cb) ||
+                            (sk2->sk_state != TCP_TIME_WAIT &&
                              !uid_eq(uid, sock_i_uid(sk2))))) {
 
                                if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
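The rcu_access_pointer(sk->sk_reuseport_cb) term added above means a socket that is already attached to a reuseport group no longer gets the relaxed SO_REUSEPORT treatment in this bind-conflict check. For orientation only, here is a minimal userspace sketch, not part of this patch (reuseport_listener() and the port number are made up), of the contract these checks arbitrate: every socket in the group sets SO_REUSEPORT before bind(), and the kernel additionally requires the sockets to share an owning UID.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Hypothetical helper: create one member of a SO_REUSEPORT listener group. */
static int reuseport_listener(uint16_t port)
{
        struct sockaddr_in addr;
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        /* Must be set before bind() on every socket sharing the port. */
        if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0)
                goto err;
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        addr.sin_port = htons(port);
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(fd, 16) < 0)
                goto err;
        return fd;
err:
        perror("reuseport_listener");
        close(fd);
        return -1;
}

int main(void)
{
        int a = reuseport_listener(8080);
        int b = reuseport_listener(8080);       /* same port, no EADDRINUSE */

        printf("listeners: %d %d\n", a, b);
        return a < 0 || b < 0;
}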
@@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
  */
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
-        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+        bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+        struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+        int ret = 1, attempts = 5, port = snum;
+        int smallest_size = -1, smallest_port;
         struct inet_bind_hashbucket *head;
-        struct inet_bind_bucket *tb;
-        int ret, attempts = 5;
         struct net *net = sock_net(sk);
-        int smallest_size = -1, smallest_rover;
+        int i, low, high, attempt_half;
+        struct inet_bind_bucket *tb;
         kuid_t uid = sock_i_uid(sk);
-        int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+        u32 remaining, offset;
 
-        local_bh_disable();
-        if (!snum) {
-                int remaining, rover, low, high;
+        if (port) {
+have_port:
+                head = &hinfo->bhash[inet_bhashfn(net, port,
+                                                  hinfo->bhash_size)];
+                spin_lock_bh(&head->lock);
+                inet_bind_bucket_for_each(tb, &head->chain)
+                        if (net_eq(ib_net(tb), net) && tb->port == port)
+                                goto tb_found;
 
+                goto tb_not_found;
+        }
 again:
-                inet_get_local_port_range(net, &low, &high);
-                if (attempt_half) {
-                        int half = low + ((high - low) >> 1);
-
-                        if (attempt_half == 1)
-                                high = half;
-                        else
-                                low = half;
-                }
-                remaining = (high - low) + 1;
-                smallest_rover = rover = prandom_u32() % remaining + low;
-
-                smallest_size = -1;
-                do {
-                        if (inet_is_local_reserved_port(net, rover))
-                                goto next_nolock;
-                        head = &hashinfo->bhash[inet_bhashfn(net, rover,
-                                        hashinfo->bhash_size)];
-                        spin_lock(&head->lock);
-                        inet_bind_bucket_for_each(tb, &head->chain)
-                                if (net_eq(ib_net(tb), net) && tb->port == rover) {
-                                        if (((tb->fastreuse > 0 &&
-                                              sk->sk_reuse &&
-                                              sk->sk_state != TCP_LISTEN) ||
-                                             (tb->fastreuseport > 0 &&
-                                              sk->sk_reuseport &&
-                                              uid_eq(tb->fastuid, uid))) &&
-                                            (tb->num_owners < smallest_size || smallest_size == -1)) {
-                                                smallest_size = tb->num_owners;
-                                                smallest_rover = rover;
-                                        }
-                                        if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-                                                snum = rover;
-                                                goto tb_found;
-                                        }
-                                        goto next;
+        attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+        inet_get_local_port_range(net, &low, &high);
+        high++; /* [32768, 60999] -> [32768, 61000[ */
+        if (high - low < 4)
+                attempt_half = 0;
+        if (attempt_half) {
+                int half = low + (((high - low) >> 2) << 1);
+
+                if (attempt_half == 1)
+                        high = half;
+                else
+                        low = half;
+        }
+        remaining = high - low;
+        if (likely(remaining > 1))
+                remaining &= ~1U;
+
+        offset = prandom_u32() % remaining;
+        /* __inet_hash_connect() favors ports having @low parity
+         * We do the opposite to not pollute connect() users.
+         */
+        offset |= 1U;
+        smallest_size = -1;
+        smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+        port = low + offset;
+        for (i = 0; i < remaining; i += 2, port += 2) {
+                if (unlikely(port >= high))
+                        port -= remaining;
+                if (inet_is_local_reserved_port(net, port))
+                        continue;
+                head = &hinfo->bhash[inet_bhashfn(net, port,
+                                                  hinfo->bhash_size)];
+                spin_lock_bh(&head->lock);
+                inet_bind_bucket_for_each(tb, &head->chain)
+                        if (net_eq(ib_net(tb), net) && tb->port == port) {
+                                if (((tb->fastreuse > 0 && reuse) ||
+                                     (tb->fastreuseport > 0 &&
+                                      sk->sk_reuseport &&
+                                      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+                                      uid_eq(tb->fastuid, uid))) &&
+                                    (tb->num_owners < smallest_size || smallest_size == -1)) {
+                                        smallest_size = tb->num_owners;
+                                        smallest_port = port;
                                 }
-                        break;
-                next:
-                        spin_unlock(&head->lock);
-                next_nolock:
-                        if (++rover > high)
-                                rover = low;
-                } while (--remaining > 0);
-
-                /* Exhausted local port range during search?  It is not
-                 * possible for us to be holding one of the bind hash
-                 * locks if this test triggers, because if 'remaining'
-                 * drops to zero, we broke out of the do/while loop at
-                 * the top level, not from the 'break;' statement.
-                 */
-                ret = 1;
-                if (remaining <= 0) {
-                        if (smallest_size != -1) {
-                                snum = smallest_rover;
-                                goto have_snum;
-                        }
-                        if (attempt_half == 1) {
-                                /* OK we now try the upper half of the range */
-                                attempt_half = 2;
-                                goto again;
+                                if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+                                        goto tb_found;
+                                goto next_port;
                         }
-                        goto fail;
-                }
-                /* OK, here is the one we will use.  HEAD is
-                 * non-NULL and we hold it's mutex.
-                 */
-                snum = rover;
-        } else {
-have_snum:
-                head = &hashinfo->bhash[inet_bhashfn(net, snum,
-                                hashinfo->bhash_size)];
-                spin_lock(&head->lock);
-                inet_bind_bucket_for_each(tb, &head->chain)
-                        if (net_eq(ib_net(tb), net) && tb->port == snum)
-                                goto tb_found;
+                goto tb_not_found;
+next_port:
+                spin_unlock_bh(&head->lock);
+                cond_resched();
+        }
+
+        if (smallest_size != -1) {
+                port = smallest_port;
+                goto have_port;
+        }
+        offset--;
+        if (!(offset & 1))
+                goto other_parity_scan;
+
+        if (attempt_half == 1) {
+                /* OK we now try the upper half of the range */
+                attempt_half = 2;
+                goto other_half_scan;
         }
-        tb = NULL;
-        goto tb_not_found;
+        return ret;
+
+tb_not_found:
+        tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+                                     net, head, port);
+        if (!tb)
+                goto fail_unlock;
 tb_found:
         if (!hlist_empty(&tb->owners)) {
                 if (sk->sk_reuse == SK_FORCE_REUSE)
                         goto success;
 
-                if (((tb->fastreuse > 0 &&
-                      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+                if (((tb->fastreuse > 0 && reuse) ||
                      (tb->fastreuseport > 0 &&
+                      !rcu_access_pointer(sk->sk_reuseport_cb) &&
                       sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-                    smallest_size == -1) {
+                    smallest_size == -1)
                         goto success;
-                } else {
-                        ret = 1;
-                        if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-                                if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
-                                     (tb->fastreuseport > 0 &&
-                                      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-                                    smallest_size != -1 && --attempts >= 0) {
-                                        spin_unlock(&head->lock);
-                                        goto again;
-                                }
-
-                                goto fail_unlock;
+                if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+                        if ((reuse ||
+                             (tb->fastreuseport > 0 &&
+                              sk->sk_reuseport &&
+                              !rcu_access_pointer(sk->sk_reuseport_cb) &&
+                              uid_eq(tb->fastuid, uid))) &&
+                            smallest_size != -1 && --attempts >= 0) {
+                                spin_unlock_bh(&head->lock);
+                                goto again;
                         }
+                        goto fail_unlock;
                 }
-        }
-tb_not_found:
-        ret = 1;
-        if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-                                        net, head, snum)) == NULL)
-                goto fail_unlock;
-        if (hlist_empty(&tb->owners)) {
-                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-                        tb->fastreuse = 1;
-                else
+                if (!reuse)
                         tb->fastreuse = 0;
+                if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+                        tb->fastreuseport = 0;
+        } else {
+                tb->fastreuse = reuse;
                 if (sk->sk_reuseport) {
                         tb->fastreuseport = 1;
                         tb->fastuid = uid;
-                } else
-                        tb->fastreuseport = 0;
-        } else {
-                if (tb->fastreuse &&
-                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-                        tb->fastreuse = 0;
-                if (tb->fastreuseport &&
-                    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+                } else {
                         tb->fastreuseport = 0;
+                }
         }
 success:
         if (!inet_csk(sk)->icsk_bind_hash)
-                inet_bind_hash(sk, tb, snum);
+                inet_bind_hash(sk, tb, port);
         WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
         ret = 0;
 
 fail_unlock:
-        spin_unlock(&head->lock);
-fail:
-        local_bh_enable();
+        spin_unlock_bh(&head->lock);
         return ret;
 }
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
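The rewrite above replaces the old random-rover loop with a two-pass scan of the ephemeral range: pick a random offset, force it odd (offset |= 1U), walk the half-open range in steps of two with wrap-around, and only fall back to the even offsets afterwards, so bind() mostly consumes ports of the parity that __inet_hash_connect() avoids for connect(). A rough userspace illustration of that scan order follows (not kernel code; scan_ports() and try_port() are made-up stand-ins for the real bind-bucket and conflict checks).

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the bucket lookup + bind_conflict() decision. */
static bool try_port(unsigned int port)
{
        (void)port;
        return false;   /* pretend every port conflicts, to print the full order */
}

static int scan_ports(unsigned int low, unsigned int high_inclusive)
{
        unsigned int high = high_inclusive + 1; /* [low, high[ as in the patch */
        unsigned int remaining = high - low;
        unsigned int offset, port, i;

        if (remaining > 1)
                remaining &= ~1U;       /* even number of candidates */
        offset = rand() % remaining;
        offset |= 1U;                   /* first pass: odd offsets only */

        for (int pass = 0; pass < 2; pass++) {
                port = low + offset;
                for (i = 0; i < remaining; i += 2, port += 2) {
                        if (port >= high)
                                port -= remaining;      /* wrap around */
                        printf("trying %u\n", port);
                        if (try_port(port))
                                return (int)port;
                }
                offset--;               /* second pass: the other parity */
        }
        return -1;
}

int main(void)
{
        scan_ports(32768, 32775);
        return 0;
}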
@@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 #define AF_INET_FAMILY(fam) true
 #endif
 
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
                                   const int max_retries,
@@ -557,6 +548,7 @@ static void reqsk_timer_handler(unsigned long data)
 {
        struct request_sock *req = (struct request_sock *)data;
        struct sock *sk_listener = req->rsk_listener;
+       struct net *net = sock_net(sk_listener);
        struct inet_connection_sock *icsk = inet_csk(sk_listener);
        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
        int qlen, expire = 0, resend = 0;
@@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data)
        if (sk_state_load(sk_listener) != TCP_LISTEN)
                goto drop;
 
-       max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+       max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
        thresh = max_retries;
        /* Normally all the openreqs are young and become mature
         * (i.e. converted to established socket) for first timeout.
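Together with the struct net pointer added in the previous hunk, this makes the SYN-ACK retry limit come from the listener's own network namespace rather than a global sysctl. A small userspace check of the per-namespace behaviour (illustrative only, not from this patch; show() is a made-up helper, and unshare(CLONE_NEWNET) needs CAP_NET_ADMIN):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

static void show(const char *when)
{
        char buf[16] = "";
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_synack_retries", "r");

        if (!f || !fgets(buf, sizeof(buf), f)) {
                perror("tcp_synack_retries");
                exit(1);
        }
        fclose(f);
        printf("%s: tcp_synack_retries = %s", when, buf);
}

int main(void)
{
        show("initial netns");
        if (unshare(CLONE_NEWNET)) {    /* enter a fresh network namespace */
                perror("unshare");
                return 1;
        }
        show("new netns");              /* reads this namespace's own copy */
        return 0;
}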
@@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet = inet_sk(sk);
+       int err = -EADDRINUSE;
 
        reqsk_queue_alloc(&icsk->icsk_accept_queue);
 
@@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
                inet->inet_sport = htons(inet->inet_num);
 
                sk_dst_reset(sk);
-               sk->sk_prot->hash(sk);
+               err = sk->sk_prot->hash(sk);
 
-               return 0;
+               if (likely(!err))
+                       return 0;
        }
 
        sk->sk_state = TCP_CLOSE;
-       return -EADDRINUSE;
+       return err;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 
@@ -789,14 +783,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
        reqsk_put(req);
 }
 
-void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
-                             struct sock *child)
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+                                     struct request_sock *req,
+                                     struct sock *child)
 {
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 
        spin_lock(&queue->rskq_lock);
        if (unlikely(sk->sk_state != TCP_LISTEN)) {
                inet_child_forget(sk, req, child);
+               child = NULL;
        } else {
                req->sk = child;
                req->dl_next = NULL;
@@ -808,6 +804,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
                sk_acceptq_added(sk);
        }
        spin_unlock(&queue->rskq_lock);
+       return child;
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
@@ -817,11 +814,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
        if (own_req) {
                inet_csk_reqsk_queue_drop(sk, req);
                reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-               inet_csk_reqsk_queue_add(sk, req, child);
-               /* Warning: caller must not call reqsk_put(req);
-                * child stole last reference on it.
-                */
-               return child;
+               if (inet_csk_reqsk_queue_add(sk, req, child))
+                       return child;
        }
        /* Too bad, another child took ownership of the request, undo. */
        bh_unlock_sock(child);