| author | Andy Grover <andy.grover@oracle.com> | 2009-08-21 08:28:31 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2009-08-23 22:13:02 -0400 |
| commit | 70041088e3b976627ba9a183b812f39ef8a9ba0e (patch) | |
| tree | bad7b11763d7b02b185bd705fe5ed292397cbc7a /net/rds/tcp_recv.c | |
| parent | 7d6fd5e7e97a2188d56441e4e96494c21c5994a7 (diff) | |
RDS: Add TCP transport to RDS
This code allows RDS to be tunneled over a TCP connection.
RDMA operations are disabled when using the TCP transport,
but this frees RDS from the IB/RDMA stack dependency and allows
it to be used with standard Ethernet adapters, or in a VM.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds/tcp_recv.c')
-rw-r--r-- | net/rds/tcp_recv.c | 356
1 file changed, 356 insertions, 0 deletions
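Before the diff itself, it helps to know the framing that this file assumes: each RDS message crosses the TCP byte stream as a fixed-size struct rds_header followed by h_len payload bytes, back to back with no extra delimiters, and rds_tcp_data_recv() below reassembles messages on exactly that basis. The following user-space sketch of the same walk is illustrative only — wire_header is a hypothetical stand-in for struct rds_header (defined in net/rds/rds.h, not in this file), and every field name other than h_len and h_flags is invented:

#include <arpa/inet.h>          /* ntohl(): the be32_to_cpu() of user space */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wire_header {            /* hypothetical stand-in for struct rds_header */
        uint32_t h_len;         /* payload length, big-endian on the wire */
        uint8_t  h_flags;       /* e.g. RDS_FLAG_CONG_BITMAP */
        uint8_t  h_pad[27];     /* remaining header fields, irrelevant here */
};

/*
 * Walk one contiguous chunk of the stream, reporting message boundaries.
 * The kernel code below performs the same walk incrementally across many
 * skbs, carrying partial-header/partial-payload state between callbacks.
 */
static void walk_stream(const uint8_t *buf, size_t len)
{
        size_t off = 0;

        while (off + sizeof(struct wire_header) <= len) {
                struct wire_header hdr;
                uint32_t payload;

                memcpy(&hdr, buf + off, sizeof(hdr));
                payload = ntohl(hdr.h_len);
                off += sizeof(hdr);

                if (off + payload > len)
                        break;  /* message straddles the chunk; need more data */
                printf("message: %u payload bytes, flags 0x%x\n",
                       payload, hdr.h_flags);
                off += payload;
        }
}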
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 000000000000..c00dafffbb5a
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,356 @@
/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  - Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <net/tcp.h>

#include "rds.h"
#include "tcp.h"

static struct kmem_cache *rds_tcp_incoming_slab;

void rds_tcp_inc_purge(struct rds_incoming *inc)
{
        struct rds_tcp_incoming *tinc;
        tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
        rdsdebug("purging tinc %p inc %p\n", tinc, inc);
        skb_queue_purge(&tinc->ti_skb_list);
}

void rds_tcp_inc_free(struct rds_incoming *inc)
{
        struct rds_tcp_incoming *tinc;
        tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
        rds_tcp_inc_purge(inc);
        rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
        kmem_cache_free(rds_tcp_incoming_slab, tinc);
}

/*
 * Copy the payload queued on ti_skb_list into the caller's iovec; returns
 * the number of bytes copied, or -EFAULT.  This is pretty lame, but,
 * whatever.
 */
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
                             size_t size)
{
        struct rds_tcp_incoming *tinc;
        struct iovec *iov, tmp;
        struct sk_buff *skb;
        unsigned long to_copy, skb_off;
        int ret = 0;

        if (size == 0)
                goto out;

        tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
        iov = first_iov;
        tmp = *iov;

        skb_queue_walk(&tinc->ti_skb_list, skb) {
                skb_off = 0;
                while (skb_off < skb->len) {
                        while (tmp.iov_len == 0) {
                                iov++;
                                tmp = *iov;
                        }

                        to_copy = min(tmp.iov_len, size);
                        to_copy = min(to_copy, skb->len - skb_off);

                        rdsdebug("ret %d size %zu skb %p skb_off %lu "
                                 "skblen %d iov_base %p iov_len %zu cpy %lu\n",
                                 ret, size, skb, skb_off, skb->len,
                                 tmp.iov_base, tmp.iov_len, to_copy);

                        /* modifies tmp as it copies */
                        if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
                                                    to_copy)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        size -= to_copy;
                        ret += to_copy;
                        skb_off += to_copy;
                        if (size == 0)
                                goto out;
                }
        }
out:
        return ret;
}
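
/*
 * Worked example of the copy loop above (illustrative numbers only): a
 * 100-byte message queued as two skbs of 60 and 40 bytes, copied into an
 * iovec of two 50-byte buffers, proceeds in three passes -- 50 bytes
 * (first skb -> first buffer), 10 bytes (first skb -> second buffer),
 * then 40 bytes (second skb -> second buffer) -- and returns 100.
 */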
110 | |||
111 | /* | ||
112 | * We have a series of skbs that have fragmented pieces of the congestion | ||
113 | * bitmap. They must add up to the exact size of the congestion bitmap. We | ||
114 | * use the skb helpers to copy those into the pages that make up the in-memory | ||
115 | * congestion bitmap for the remote address of this connection. We then tell | ||
116 | * the congestion core that the bitmap has been changed so that it can wake up | ||
117 | * sleepers. | ||
118 | * | ||
119 | * This is racing with sending paths which are using test_bit to see if the | ||
120 | * bitmap indicates that their recipient is congested. | ||
121 | */ | ||
122 | |||
static void rds_tcp_cong_recv(struct rds_connection *conn,
                              struct rds_tcp_incoming *tinc)
{
        struct sk_buff *skb;
        unsigned int to_copy, skb_off;
        unsigned int map_off;
        unsigned int map_page;
        struct rds_cong_map *map;
        int ret;

        /* catch completely corrupt packets */
        if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
                return;

        map_page = 0;
        map_off = 0;
        map = conn->c_fcong;

        skb_queue_walk(&tinc->ti_skb_list, skb) {
                skb_off = 0;
                while (skb_off < skb->len) {
                        to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
                                        skb->len - skb_off);

                        BUG_ON(map_page >= RDS_CONG_MAP_PAGES);

                        /* only returns 0 or -error */
                        ret = skb_copy_bits(skb, skb_off,
                                (void *)map->m_page_addrs[map_page] + map_off,
                                to_copy);
                        BUG_ON(ret != 0);

                        skb_off += to_copy;
                        map_off += to_copy;
                        if (map_off == PAGE_SIZE) {
                                map_off = 0;
                                map_page++;
                        }
                }
        }

        rds_cong_map_updated(map, ~(u64) 0);
}
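
/*
 * Argument bundle handed to tcp_read_sock() through read_descriptor_t's
 * arg.data pointer and unpacked again in rds_tcp_data_recv().
 */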
166 | |||
167 | struct rds_tcp_desc_arg { | ||
168 | struct rds_connection *conn; | ||
169 | gfp_t gfp; | ||
170 | enum km_type km; | ||
171 | }; | ||
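
/*
 * Called by tcp_read_sock() for each skb of in-order stream data.  Message
 * reassembly is driven by two counters on the connection: t_tinc_hdr_rem
 * (header bytes still needed) and t_tinc_data_rem (payload bytes still
 * needed, taken from h_len once the header is complete).  Either can be
 * satisfied piecemeal, so a message may straddle any number of skbs and
 * callbacks.  Returning less than len tells tcp_read_sock() to stop.
 */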
172 | |||
static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
                             unsigned int offset, size_t len)
{
        struct rds_tcp_desc_arg *arg = desc->arg.data;
        struct rds_connection *conn = arg->conn;
        struct rds_tcp_connection *tc = conn->c_transport_data;
        struct rds_tcp_incoming *tinc = tc->t_tinc;
        struct sk_buff *clone;
        size_t left = len, to_copy;

        rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
                 len);

        /*
         * tcp_read_sock() interprets partial progress as an indication to stop
         * processing.
         */
        while (left) {
                if (tinc == NULL) {
                        tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
                                                arg->gfp);
                        if (tinc == NULL) {
                                desc->error = -ENOMEM;
                                goto out;
                        }
                        tc->t_tinc = tinc;
                        rdsdebug("alloced tinc %p\n", tinc);
                        rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
                        /*
                         * XXX * we might be able to use the __ variants when
                         * we've already serialized at a higher level.
                         */
                        skb_queue_head_init(&tinc->ti_skb_list);
                }

                if (left && tc->t_tinc_hdr_rem) {
                        to_copy = min(tc->t_tinc_hdr_rem, left);
                        rdsdebug("copying %zu header from skb %p\n", to_copy,
                                 skb);
                        skb_copy_bits(skb, offset,
                                      (char *)&tinc->ti_inc.i_hdr +
                                              sizeof(struct rds_header) -
                                              tc->t_tinc_hdr_rem,
                                      to_copy);
                        tc->t_tinc_hdr_rem -= to_copy;
                        left -= to_copy;
                        offset += to_copy;

                        if (tc->t_tinc_hdr_rem == 0) {
                                /* could be 0 for a 0 len message */
                                tc->t_tinc_data_rem =
                                        be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
                        }
                }

                if (left && tc->t_tinc_data_rem) {
                        clone = skb_clone(skb, arg->gfp);
                        if (clone == NULL) {
                                desc->error = -ENOMEM;
                                goto out;
                        }

                        to_copy = min(tc->t_tinc_data_rem, left);
                        pskb_pull(clone, offset);
                        pskb_trim(clone, to_copy);
                        skb_queue_tail(&tinc->ti_skb_list, clone);

                        rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
                                 "clone %p data %p len %d\n",
                                 skb, skb->data, skb->len, offset, to_copy,
                                 clone, clone->data, clone->len);

                        tc->t_tinc_data_rem -= to_copy;
                        left -= to_copy;
                        offset += to_copy;
                }

                if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
                        if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
                                rds_tcp_cong_recv(conn, tinc);
                        else
                                rds_recv_incoming(conn, conn->c_faddr,
                                                  conn->c_laddr, &tinc->ti_inc,
                                                  arg->gfp, arg->km);

                        tc->t_tinc_hdr_rem = sizeof(struct rds_header);
                        tc->t_tinc_data_rem = 0;
                        tc->t_tinc = NULL;
                        rds_inc_put(&tinc->ti_inc);
                        tinc = NULL;
                }
        }
out:
        rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
                 len, left, skb->len,
                 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
        return len - left;
}
271 | |||
272 | /* the caller has to hold the sock lock */ | ||
273 | int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) | ||
274 | { | ||
275 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
276 | struct socket *sock = tc->t_sock; | ||
277 | read_descriptor_t desc; | ||
278 | struct rds_tcp_desc_arg arg; | ||
279 | |||
280 | /* It's like glib in the kernel! */ | ||
281 | arg.conn = conn; | ||
282 | arg.gfp = gfp; | ||
283 | arg.km = km; | ||
284 | desc.arg.data = &arg; | ||
285 | desc.error = 0; | ||
286 | desc.count = 1; /* give more than one skb per call */ | ||
287 | |||
288 | tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); | ||
289 | rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, | ||
290 | desc.error); | ||
291 | |||
292 | return desc.error; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from | ||
297 | * data_ready. | ||
298 | * | ||
299 | * if we fail to allocate we're in trouble.. blindly wait some time before | ||
300 | * trying again to see if the VM can free up something for us. | ||
301 | */ | ||
302 | int rds_tcp_recv(struct rds_connection *conn) | ||
303 | { | ||
304 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
305 | struct socket *sock = tc->t_sock; | ||
306 | int ret = 0; | ||
307 | |||
308 | rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); | ||
309 | |||
310 | lock_sock(sock->sk); | ||
311 | ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); | ||
312 | release_sock(sock->sk); | ||
313 | |||
314 | return ret; | ||
315 | } | ||
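
/*
 * Wired up by the rest of this patch as the socket's sk_data_ready
 * callback, with the original handler saved in t_orig_data_ready and
 * invoked at the end.  Runs in softirq context, hence GFP_ATOMIC and
 * KM_SOFTIRQ0; on -ENOMEM the read is retried from the connection's
 * recv worker, which can sleep and allocate with GFP_KERNEL.
 */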
316 | |||
317 | void rds_tcp_data_ready(struct sock *sk, int bytes) | ||
318 | { | ||
319 | void (*ready)(struct sock *sk, int bytes); | ||
320 | struct rds_connection *conn; | ||
321 | struct rds_tcp_connection *tc; | ||
322 | |||
323 | rdsdebug("data ready sk %p bytes %d\n", sk, bytes); | ||
324 | |||
325 | read_lock(&sk->sk_callback_lock); | ||
326 | conn = sk->sk_user_data; | ||
327 | if (conn == NULL) { /* check for teardown race */ | ||
328 | ready = sk->sk_data_ready; | ||
329 | goto out; | ||
330 | } | ||
331 | |||
332 | tc = conn->c_transport_data; | ||
333 | ready = tc->t_orig_data_ready; | ||
334 | rds_tcp_stats_inc(s_tcp_data_ready_calls); | ||
335 | |||
336 | if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) | ||
337 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
338 | out: | ||
339 | read_unlock(&sk->sk_callback_lock); | ||
340 | ready(sk, bytes); | ||
341 | } | ||
342 | |||
343 | int __init rds_tcp_recv_init(void) | ||
344 | { | ||
345 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", | ||
346 | sizeof(struct rds_tcp_incoming), | ||
347 | 0, 0, NULL); | ||
348 | if (rds_tcp_incoming_slab == NULL) | ||
349 | return -ENOMEM; | ||
350 | return 0; | ||
351 | } | ||
352 | |||
353 | void rds_tcp_recv_exit(void) | ||
354 | { | ||
355 | kmem_cache_destroy(rds_tcp_incoming_slab); | ||
356 | } | ||