Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r--  net/ipv4/inet_connection_sock.c | 268
1 file changed, 131 insertions, 137 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 46b9c887bede..bc5196ea1bdf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 #include <net/tcp.h>
+#include <net/sock_reuseport.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
             if ((!reuse || !sk2->sk_reuse ||
                 sk2->sk_state == TCP_LISTEN) &&
                 (!reuseport || !sk2->sk_reuseport ||
-                 (sk2->sk_state != TCP_TIME_WAIT &&
+                 rcu_access_pointer(sk->sk_reuseport_cb) ||
+                 (sk2->sk_state != TCP_TIME_WAIT &&
                   !uid_eq(uid, sock_i_uid(sk2))))) {
 
                 if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
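Note on the hunk above: with the added rcu_access_pointer(sk->sk_reuseport_cb) test, the SO_REUSEPORT exemption in the generic conflict check is skipped once the socket already has a reuseport group attached. For orientation only, a minimal userspace sketch (not part of this patch; the port number is arbitrary) of the case this check arbitrates, i.e. two same-UID listeners sharing one port via SO_REUSEPORT:

/* Sketch: both bind() calls succeed only if every socket on the port
 * set SO_REUSEPORT before binding and the owning UIDs match.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int listener(uint16_t port)
{
    int one = 1;
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(port);
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(fd, 16) < 0) {
        perror("bind/listen");
        close(fd);
        return -1;
    }
    return fd;
}

int main(void)
{
    int a = listener(8080);    /* hypothetical port */
    int b = listener(8080);    /* second bind allowed by SO_REUSEPORT */

    printf("fd a=%d, fd b=%d\n", a, b);
    return 0;
}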
@@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
  */
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
-    struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+    bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+    struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+    int ret = 1, attempts = 5, port = snum;
+    int smallest_size = -1, smallest_port;
     struct inet_bind_hashbucket *head;
-    struct inet_bind_bucket *tb;
-    int ret, attempts = 5;
     struct net *net = sock_net(sk);
-    int smallest_size = -1, smallest_rover;
+    int i, low, high, attempt_half;
+    struct inet_bind_bucket *tb;
     kuid_t uid = sock_i_uid(sk);
-    int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+    u32 remaining, offset;
 
-    local_bh_disable();
-    if (!snum) {
-        int remaining, rover, low, high;
+    if (port) {
+have_port:
+        head = &hinfo->bhash[inet_bhashfn(net, port,
+                                          hinfo->bhash_size)];
+        spin_lock_bh(&head->lock);
+        inet_bind_bucket_for_each(tb, &head->chain)
+            if (net_eq(ib_net(tb), net) && tb->port == port)
+                goto tb_found;
 
+        goto tb_not_found;
+    }
 again:
-        inet_get_local_port_range(net, &low, &high);
-        if (attempt_half) {
-            int half = low + ((high - low) >> 1);
-
-            if (attempt_half == 1)
-                high = half;
-            else
-                low = half;
-        }
-        remaining = (high - low) + 1;
-        smallest_rover = rover = prandom_u32() % remaining + low;
-
-        smallest_size = -1;
-        do {
-            if (inet_is_local_reserved_port(net, rover))
-                goto next_nolock;
-            head = &hashinfo->bhash[inet_bhashfn(net, rover,
-                    hashinfo->bhash_size)];
-            spin_lock(&head->lock);
-            inet_bind_bucket_for_each(tb, &head->chain)
-                if (net_eq(ib_net(tb), net) && tb->port == rover) {
-                    if (((tb->fastreuse > 0 &&
-                          sk->sk_reuse &&
-                          sk->sk_state != TCP_LISTEN) ||
-                         (tb->fastreuseport > 0 &&
-                          sk->sk_reuseport &&
-                          uid_eq(tb->fastuid, uid))) &&
-                        (tb->num_owners < smallest_size || smallest_size == -1)) {
-                        smallest_size = tb->num_owners;
-                        smallest_rover = rover;
-                    }
-                    if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-                        snum = rover;
-                        goto tb_found;
-                    }
-                    goto next;
+    attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+    inet_get_local_port_range(net, &low, &high);
+    high++; /* [32768, 60999] -> [32768, 61000[ */
+    if (high - low < 4)
+        attempt_half = 0;
+    if (attempt_half) {
+        int half = low + (((high - low) >> 2) << 1);
+
+        if (attempt_half == 1)
+            high = half;
+        else
+            low = half;
+    }
+    remaining = high - low;
+    if (likely(remaining > 1))
+        remaining &= ~1U;
+
+    offset = prandom_u32() % remaining;
+    /* __inet_hash_connect() favors ports having @low parity
+     * We do the opposite to not pollute connect() users.
+     */
+    offset |= 1U;
+    smallest_size = -1;
+    smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+    port = low + offset;
+    for (i = 0; i < remaining; i += 2, port += 2) {
+        if (unlikely(port >= high))
+            port -= remaining;
+        if (inet_is_local_reserved_port(net, port))
+            continue;
+        head = &hinfo->bhash[inet_bhashfn(net, port,
+                                          hinfo->bhash_size)];
+        spin_lock_bh(&head->lock);
+        inet_bind_bucket_for_each(tb, &head->chain)
+            if (net_eq(ib_net(tb), net) && tb->port == port) {
+                if (((tb->fastreuse > 0 && reuse) ||
+                     (tb->fastreuseport > 0 &&
+                      sk->sk_reuseport &&
+                      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+                      uid_eq(tb->fastuid, uid))) &&
+                    (tb->num_owners < smallest_size || smallest_size == -1)) {
+                    smallest_size = tb->num_owners;
+                    smallest_port = port;
                 }
-            break;
-        next:
-            spin_unlock(&head->lock);
-        next_nolock:
-            if (++rover > high)
-                rover = low;
-        } while (--remaining > 0);
-
-        /* Exhausted local port range during search? It is not
-         * possible for us to be holding one of the bind hash
-         * locks if this test triggers, because if 'remaining'
-         * drops to zero, we broke out of the do/while loop at
-         * the top level, not from the 'break;' statement.
-         */
-        ret = 1;
-        if (remaining <= 0) {
-            if (smallest_size != -1) {
-                snum = smallest_rover;
-                goto have_snum;
-            }
-            if (attempt_half == 1) {
-                /* OK we now try the upper half of the range */
-                attempt_half = 2;
-                goto again;
+                if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+                    goto tb_found;
+                goto next_port;
             }
-            goto fail;
-        }
-        /* OK, here is the one we will use. HEAD is
-         * non-NULL and we hold it's mutex.
-         */
-        snum = rover;
-    } else {
-have_snum:
-        head = &hashinfo->bhash[inet_bhashfn(net, snum,
-                hashinfo->bhash_size)];
-        spin_lock(&head->lock);
-        inet_bind_bucket_for_each(tb, &head->chain)
-            if (net_eq(ib_net(tb), net) && tb->port == snum)
-                goto tb_found;
+        goto tb_not_found;
+next_port:
+        spin_unlock_bh(&head->lock);
+        cond_resched();
+    }
+
+    if (smallest_size != -1) {
+        port = smallest_port;
+        goto have_port;
+    }
+    offset--;
+    if (!(offset & 1))
+        goto other_parity_scan;
+
+    if (attempt_half == 1) {
+        /* OK we now try the upper half of the range */
+        attempt_half = 2;
+        goto other_half_scan;
     }
-    tb = NULL;
-    goto tb_not_found;
+    return ret;
+
+tb_not_found:
+    tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+                                 net, head, port);
+    if (!tb)
+        goto fail_unlock;
 tb_found:
     if (!hlist_empty(&tb->owners)) {
         if (sk->sk_reuse == SK_FORCE_REUSE)
             goto success;
 
-        if (((tb->fastreuse > 0 &&
-              sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+        if (((tb->fastreuse > 0 && reuse) ||
              (tb->fastreuseport > 0 &&
+              !rcu_access_pointer(sk->sk_reuseport_cb) &&
               sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-            smallest_size == -1) {
+            smallest_size == -1)
             goto success;
-        } else {
-            ret = 1;
-            if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-                if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
-                     (tb->fastreuseport > 0 &&
-                      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+        if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+            if ((reuse ||
+                 (tb->fastreuseport > 0 &&
+                  sk->sk_reuseport &&
+                  !rcu_access_pointer(sk->sk_reuseport_cb) &&
+                  uid_eq(tb->fastuid, uid))) &&
                 smallest_size != -1 && --attempts >= 0) {
-                spin_unlock(&head->lock);
+                spin_unlock_bh(&head->lock);
                 goto again;
-            }
-
-            goto fail_unlock;
             }
+            goto fail_unlock;
         }
-    }
-tb_not_found:
-    ret = 1;
-    if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-                    net, head, snum)) == NULL)
-        goto fail_unlock;
-    if (hlist_empty(&tb->owners)) {
-        if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-            tb->fastreuse = 1;
-        else
+        if (!reuse)
             tb->fastreuse = 0;
+        if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+            tb->fastreuseport = 0;
+    } else {
+        tb->fastreuse = reuse;
         if (sk->sk_reuseport) {
             tb->fastreuseport = 1;
             tb->fastuid = uid;
-        } else
-            tb->fastreuseport = 0;
-    } else {
-        if (tb->fastreuse &&
-            (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-            tb->fastreuse = 0;
-        if (tb->fastreuseport &&
-            (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+        } else {
             tb->fastreuseport = 0;
+        }
     }
 success:
     if (!inet_csk(sk)->icsk_bind_hash)
-        inet_bind_hash(sk, tb, snum);
+        inet_bind_hash(sk, tb, port);
     WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
     ret = 0;
 
 fail_unlock:
-    spin_unlock(&head->lock);
-fail:
-    local_bh_enable();
+    spin_unlock_bh(&head->lock);
     return ret;
 }
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
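The rewritten inet_csk_get_port() above changes how bind() picks an ephemeral port: the local range is scanned one half at a time, the random starting offset is forced odd (offset |= 1U), and even ports are left to __inet_hash_connect() for connect(). A small observation sketch (userspace, not part of the patch) that lets the kernel choose ports and prints the result:

/* Sketch: bind() to port 0 and report the port the kernel picked; on a
 * kernel with this change the printed ports tend to be odd.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    for (int i = 0; i < 4; i++) {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        struct sockaddr_in addr;
        socklen_t len = sizeof(addr);

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        addr.sin_port = 0;    /* let the kernel choose */
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0 &&
            getsockname(fd, (struct sockaddr *)&addr, &len) == 0)
            printf("got port %u\n", ntohs(addr.sin_port));
        close(fd);
    }
    return 0;
}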
@@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 #define AF_INET_FAMILY(fam) true
 #endif
 
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
                                   const int max_retries,
@@ -557,6 +548,7 @@ static void reqsk_timer_handler(unsigned long data)
 {
     struct request_sock *req = (struct request_sock *)data;
     struct sock *sk_listener = req->rsk_listener;
+    struct net *net = sock_net(sk_listener);
     struct inet_connection_sock *icsk = inet_csk(sk_listener);
     struct request_sock_queue *queue = &icsk->icsk_accept_queue;
     int qlen, expire = 0, resend = 0;
@@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data)
     if (sk_state_load(sk_listener) != TCP_LISTEN)
         goto drop;
 
-    max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+    max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
     thresh = max_retries;
     /* Normally all the openreqs are young and become mature
      * (i.e. converted to established socket) for first timeout.
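Since max_retries now comes from net->ipv4.sysctl_tcp_synack_retries, the SYN-ACK retry limit is read per network namespace. A hedged sketch (not part of the patch; needs CAP_SYS_ADMIN, the value is arbitrary) of tuning it in a fresh namespace without touching the init namespace:

/* Sketch: after unshare(CLONE_NEWNET), /proc/sys/net reflects the new
 * namespace, so the write below only affects its own netns_ipv4.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    if (unshare(CLONE_NEWNET)) {    /* new net namespace, new netns_ipv4 */
        perror("unshare");
        return 1;
    }
    int fd = open("/proc/sys/net/ipv4/tcp_synack_retries", O_WRONLY);
    if (fd < 0 || write(fd, "2", 1) != 1)
        perror("tcp_synack_retries");
    else
        puts("SYN-ACK retries set to 2 in this namespace only");
    if (fd >= 0)
        close(fd);
    return 0;
}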
@@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 {
     struct inet_connection_sock *icsk = inet_csk(sk);
     struct inet_sock *inet = inet_sk(sk);
+    int err = -EADDRINUSE;
 
     reqsk_queue_alloc(&icsk->icsk_accept_queue);
 
@@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
         inet->inet_sport = htons(inet->inet_num);
 
         sk_dst_reset(sk);
-        sk->sk_prot->hash(sk);
+        err = sk->sk_prot->hash(sk);
 
-        return 0;
+        if (likely(!err))
+            return 0;
     }
 
     sk->sk_state = TCP_CLOSE;
-    return -EADDRINUSE;
+    return err;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 
@@ -789,14 +783,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
     reqsk_put(req);
 }
 
-void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
-                              struct sock *child)
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+                                      struct request_sock *req,
+                                      struct sock *child)
 {
     struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 
     spin_lock(&queue->rskq_lock);
     if (unlikely(sk->sk_state != TCP_LISTEN)) {
         inet_child_forget(sk, req, child);
+        child = NULL;
     } else {
         req->sk = child;
         req->dl_next = NULL;
@@ -808,6 +804,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
         sk_acceptq_added(sk);
     }
     spin_unlock(&queue->rskq_lock);
+    return child;
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
@@ -817,11 +814,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
     if (own_req) {
         inet_csk_reqsk_queue_drop(sk, req);
         reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-        inet_csk_reqsk_queue_add(sk, req, child);
-        /* Warning: caller must not call reqsk_put(req);
-         * child stole last reference on it.
-         */
-        return child;
+        if (inet_csk_reqsk_queue_add(sk, req, child))
+            return child;
     }
     /* Too bad, another child took ownership of the request, undo. */
     bh_unlock_sock(child);