author		Eric Dumazet <edumazet@google.com>	2016-02-11 19:28:50 -0500
committer	David S. Miller <davem@davemloft.net>	2016-02-12 05:28:32 -0500
commit		ea8add2b190395408b22a9127bed2c0912aecbc8 (patch)
tree		8b75745c913c33eba2707c3dc5fb0e312bfe3387 /net/ipv4/inet_connection_sock.c
parent		1580ab63fc9a03593072cc5656167a75c4f1d173 (diff)
tcp/dccp: better use of ephemeral ports in bind()
Implement the strategy used in __inet_hash_connect(), but in the opposite
way: try to find a candidate using odd ports first, then fall back to even
ports.

We no longer disable BH for the whole traversal, but one bucket at a time.
We also use cond_resched() to yield the cpu to other tasks if needed.

I removed one indentation level and tried to mirror the loop we have in
__inet_hash_connect(), and its variable names, to ease code maintenance.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
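The heart of the change is the scan order. The sketch below is a hypothetical userspace rendering (plain C; port_is_free() and the range bounds are stand-ins, and rand() replaces the kernel's prandom_u32()) of how the patched inet_csk_get_port() walks the ephemeral range: a random odd offset makes bind() probe odd ports first, and decrementing the offset once flips its parity for the even-port fallback pass.

```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in for the kernel's real per-port conflict checks. */
static int port_is_free(int port)
{
	return port % 7 != 0;	/* arbitrary demo predicate */
}

/* Mimics the scan order of the patched inet_csk_get_port():
 * odd ports first, then one fallback pass over even ports.
 */
static int pick_ephemeral_port(int low, int high)	/* high is exclusive */
{
	unsigned int remaining = high - low;
	unsigned int offset;
	int i, port;

	if (remaining > 1)
		remaining &= ~1U;	/* even number of candidates */

	offset = (unsigned int)rand() % remaining;
	offset |= 1U;			/* start on an odd port */

	for (;;) {
		port = low + (int)offset;
		for (i = 0; i < (int)remaining; i += 2, port += 2) {
			if (port >= high)
				port -= (int)remaining;	/* wrap inside [low, high) */
			if (port_is_free(port))
				return port;
		}
		offset--;		/* flip parity: now scan even ports */
		if (offset & 1)		/* both parities exhausted */
			return -1;
	}
}

int main(void)
{
	srand((unsigned int)time(NULL));
	printf("picked port %d\n", pick_ephemeral_port(32768, 61000));
	return 0;
}
```

In the kernel itself the same walk takes each bind-hash bucket's spin_lock_bh() only while inspecting that bucket and calls cond_resched() between ports, rather than disabling BH across the whole traversal as the old code did.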
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r--	net/ipv4/inet_connection_sock.c	240
1 file changed, 114 insertions(+), 126 deletions(-)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c16a2e6273d9..3d28c6d5c3c3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
  */
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, attempts = 5, port = snum;
+	int smallest_size = -1, smallest_port;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
-	int smallest_size = -1, smallest_rover;
+	int i, low, high, attempt_half;
+	struct inet_bind_bucket *tb;
 	kuid_t uid = sock_i_uid(sk);
-	int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+	u32 remaining, offset;
 
-	local_bh_disable();
-	if (!snum) {
-		int remaining, rover, low, high;
+	if (port) {
+have_port:
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port)
+				goto tb_found;
 
+		goto tb_not_found;
+	}
 again:
-		inet_get_local_port_range(net, &low, &high);
-		if (attempt_half) {
-			int half = low + ((high - low) >> 1);
-
-			if (attempt_half == 1)
-				high = half;
-			else
-				low = half;
-		}
-		remaining = (high - low) + 1;
-		smallest_rover = rover = prandom_u32() % remaining + low;
-
-		smallest_size = -1;
-		do {
-			if (inet_is_local_reserved_port(net, rover))
-				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (((tb->fastreuse > 0 &&
-					      sk->sk_reuse &&
-					      sk->sk_state != TCP_LISTEN) ||
-					     (tb->fastreuseport > 0 &&
-					      sk->sk_reuseport &&
-					      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-					      uid_eq(tb->fastuid, uid))) &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
+	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	if (high - low < 4)
+		attempt_half = 0;
+	if (attempt_half) {
+		int half = low + (((high - low) >> 2) << 1);
+
+		if (attempt_half == 1)
+			high = half;
+		else
+			low = half;
+	}
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = prandom_u32() % remaining;
+	/* __inet_hash_connect() favors ports having @low parity
+	 * We do the opposite to not pollute connect() users.
+	 */
+	offset |= 1U;
+	smallest_size = -1;
+	smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
+				if (((tb->fastreuse > 0 && reuse) ||
+				     (tb->fastreuseport > 0 &&
+				      sk->sk_reuseport &&
+				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      uid_eq(tb->fastuid, uid))) &&
+				    (tb->num_owners < smallest_size || smallest_size == -1)) {
+					smallest_size = tb->num_owners;
+					smallest_port = port;
 				}
-			break;
-		next:
-			spin_unlock(&head->lock);
-		next_nolock:
-			if (++rover > high)
-				rover = low;
-		} while (--remaining > 0);
-
-		/* Exhausted local port range during search? It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (remaining <= 0) {
-			if (smallest_size != -1) {
-				snum = smallest_rover;
-				goto have_snum;
+				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+					goto tb_found;
+				goto next_port;
 			}
-			if (attempt_half == 1) {
-				/* OK we now try the upper half of the range */
-				attempt_half = 2;
-				goto again;
-			}
-			goto fail;
-		}
-		/* OK, here is the one we will use. HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
-				goto tb_found;
+		goto tb_not_found;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
 	}
-	tb = NULL;
-	goto tb_not_found;
+
+	if (smallest_size != -1) {
+		port = smallest_port;
+		goto have_port;
+	}
+	offset--;
+	if (!(offset & 1))
+		goto other_parity_scan;
+
+	if (attempt_half == 1) {
+		/* OK we now try the upper half of the range */
+		attempt_half = 2;
+		goto other_half_scan;
+	}
+	return ret;
+
+tb_not_found:
+	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+				     net, head, port);
+	if (!tb)
+		goto fail_unlock;
 tb_found:
 	if (!hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 &&
-		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+		if (((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
-		      sk->sk_reuseport &&
-		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+		    smallest_size == -1)
 			goto success;
-		} else {
-			ret = 1;
-			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    smallest_size != -1 && --attempts >= 0) {
-					spin_unlock(&head->lock);
-					goto again;
-				}
-
-				goto fail_unlock;
+		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+			if ((reuse ||
+			     (tb->fastreuseport > 0 &&
+			      sk->sk_reuseport &&
+			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			      uid_eq(tb->fastuid, uid))) &&
+			    smallest_size != -1 && --attempts >= 0) {
+				spin_unlock_bh(&head->lock);
+				goto again;
 			}
+			goto fail_unlock;
 		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-					net, head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
+		if (!reuse)
 			tb->fastreuse = 0;
+		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+			tb->fastreuseport = 0;
+	} else {
+		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = 1;
 			tb->fastuid = uid;
-		} else
-			tb->fastreuseport = 0;
-	} else {
-		if (tb->fastreuse &&
-		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-			tb->fastreuse = 0;
-		if (tb->fastreuseport &&
-		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+		} else {
 			tb->fastreuseport = 0;
+		}
 	}
 success:
 	if (!inet_csk(sk)->icsk_bind_hash)
-		inet_bind_hash(sk, tb, snum);
+		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
 	ret = 0;
 
 fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
+	spin_unlock_bh(&head->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
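One detail of the hunk above is worth calling out, offered as a plausible reading rather than anything stated in the changelog: when halving the range, the patch computes the midpoint as low + (((high - low) >> 2) << 1) instead of the old low + ((high - low) >> 1). The double shift forces the midpoint's offset from low to be even, so the upper half starts on the same parity as low and the odd-first scan means the same thing in both halves. A toy comparison:

```c
#include <stdio.h>

int main(void)
{
	/* Toy range (high exclusive) chosen so the two formulas disagree. */
	int low = 10000, high = 10007;
	int old_half = low + ((high - low) >> 1);	 /* pre-patch midpoint */
	int new_half = low + (((high - low) >> 2) << 1); /* patched midpoint */

	printf("old midpoint %d: offset %d (%s parity as low)\n",
	       old_half, old_half - low,
	       ((old_half ^ low) & 1) ? "opposite" : "same");
	printf("new midpoint %d: offset %d (%s parity as low)\n",
	       new_half, new_half - low,
	       ((new_half ^ low) & 1) ? "opposite" : "same");
	return 0;
}
```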