diff options
author | Andy Grover <andy.grover@oracle.com> | 2009-08-21 08:28:31 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-08-23 22:13:02 -0400 |
commit | 70041088e3b976627ba9a183b812f39ef8a9ba0e (patch) | |
tree | bad7b11763d7b02b185bd705fe5ed292397cbc7a /net/rds | |
parent | 7d6fd5e7e97a2188d56441e4e96494c21c5994a7 (diff) |
RDS: Add TCP transport to RDS
This code allows RDS to be tunneled over a TCP connection.
RDMA operations are disabled when using TCP transport,
but this frees RDS from the IB/RDMA stack dependency, and allows
it to be used with standard Ethernet adapters, or in a VM.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds')
-rw-r--r-- | net/rds/tcp.c | 319 | ||||
-rw-r--r-- | net/rds/tcp.h | 93 | ||||
-rw-r--r-- | net/rds/tcp_connect.c | 153 | ||||
-rw-r--r-- | net/rds/tcp_listen.c | 199 | ||||
-rw-r--r-- | net/rds/tcp_recv.c | 356 | ||||
-rw-r--r-- | net/rds/tcp_send.c | 263 | ||||
-rw-r--r-- | net/rds/tcp_stats.c | 74 |
7 files changed, 1457 insertions, 0 deletions
diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 000000000000..e0ac9009db1a --- /dev/null +++ b/net/rds/tcp.c | |||
@@ -0,0 +1,319 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <net/tcp.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "tcp.h" | ||
39 | |||
40 | /* only for info exporting */ | ||
41 | static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); | ||
42 | static LIST_HEAD(rds_tcp_tc_list); | ||
43 | unsigned int rds_tcp_tc_count; | ||
44 | |||
45 | /* Track rds_tcp_connection structs so they can be cleaned up */ | ||
46 | static DEFINE_SPINLOCK(rds_tcp_conn_lock); | ||
47 | static LIST_HEAD(rds_tcp_conn_list); | ||
48 | |||
49 | static struct kmem_cache *rds_tcp_conn_slab; | ||
50 | |||
51 | #define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) | ||
52 | |||
53 | /* doing it this way avoids calling tcp_sk() */ | ||
54 | void rds_tcp_nonagle(struct socket *sock) | ||
55 | { | ||
56 | mm_segment_t oldfs = get_fs(); | ||
57 | int val = 1; | ||
58 | |||
59 | set_fs(KERNEL_DS); | ||
60 | sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, | ||
61 | sizeof(val)); | ||
62 | set_fs(oldfs); | ||
63 | } | ||
64 | |||
65 | void rds_tcp_tune(struct socket *sock) | ||
66 | { | ||
67 | struct sock *sk = sock->sk; | ||
68 | |||
69 | rds_tcp_nonagle(sock); | ||
70 | |||
71 | /* | ||
72 | * We're trying to saturate gigabit with the default, | ||
73 | * see svc_sock_setbufsize(). | ||
74 | */ | ||
75 | lock_sock(sk); | ||
76 | sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; | ||
77 | sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; | ||
78 | sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; | ||
79 | release_sock(sk); | ||
80 | } | ||
81 | |||
82 | u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) | ||
83 | { | ||
84 | return tcp_sk(tc->t_sock->sk)->snd_nxt; | ||
85 | } | ||
86 | |||
87 | u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) | ||
88 | { | ||
89 | return tcp_sk(tc->t_sock->sk)->snd_una; | ||
90 | } | ||
91 | |||
92 | void rds_tcp_restore_callbacks(struct socket *sock, | ||
93 | struct rds_tcp_connection *tc) | ||
94 | { | ||
95 | rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); | ||
96 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
97 | |||
98 | /* done under the callback_lock to serialize with write_space */ | ||
99 | spin_lock(&rds_tcp_tc_list_lock); | ||
100 | list_del_init(&tc->t_list_item); | ||
101 | rds_tcp_tc_count--; | ||
102 | spin_unlock(&rds_tcp_tc_list_lock); | ||
103 | |||
104 | tc->t_sock = NULL; | ||
105 | |||
106 | sock->sk->sk_write_space = tc->t_orig_write_space; | ||
107 | sock->sk->sk_data_ready = tc->t_orig_data_ready; | ||
108 | sock->sk->sk_state_change = tc->t_orig_state_change; | ||
109 | sock->sk->sk_user_data = NULL; | ||
110 | |||
111 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * This is the only path that sets tc->t_sock. Send and receive trust that | ||
116 | * it is set. The RDS_CONN_CONNECTED bit protects those paths from being | ||
117 | * called while it isn't set. | ||
118 | */ | ||
119 | void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) | ||
120 | { | ||
121 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
122 | |||
123 | rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); | ||
124 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
125 | |||
126 | /* done under the callback_lock to serialize with write_space */ | ||
127 | spin_lock(&rds_tcp_tc_list_lock); | ||
128 | list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); | ||
129 | rds_tcp_tc_count++; | ||
130 | spin_unlock(&rds_tcp_tc_list_lock); | ||
131 | |||
132 | /* accepted sockets need our listen data ready undone */ | ||
133 | if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) | ||
134 | sock->sk->sk_data_ready = sock->sk->sk_user_data; | ||
135 | |||
136 | tc->t_sock = sock; | ||
137 | tc->conn = conn; | ||
138 | tc->t_orig_data_ready = sock->sk->sk_data_ready; | ||
139 | tc->t_orig_write_space = sock->sk->sk_write_space; | ||
140 | tc->t_orig_state_change = sock->sk->sk_state_change; | ||
141 | |||
142 | sock->sk->sk_user_data = conn; | ||
143 | sock->sk->sk_data_ready = rds_tcp_data_ready; | ||
144 | sock->sk->sk_write_space = rds_tcp_write_space; | ||
145 | sock->sk->sk_state_change = rds_tcp_state_change; | ||
146 | |||
147 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
148 | } | ||
149 | |||
150 | static void rds_tcp_tc_info(struct socket *sock, unsigned int len, | ||
151 | struct rds_info_iterator *iter, | ||
152 | struct rds_info_lengths *lens) | ||
153 | { | ||
154 | struct rds_info_tcp_socket tsinfo; | ||
155 | struct rds_tcp_connection *tc; | ||
156 | unsigned long flags; | ||
157 | struct sockaddr_in sin; | ||
158 | int sinlen; | ||
159 | |||
160 | spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); | ||
161 | |||
162 | if (len / sizeof(tsinfo) < rds_tcp_tc_count) | ||
163 | goto out; | ||
164 | |||
165 | list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { | ||
166 | |||
167 | sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); | ||
168 | tsinfo.local_addr = sin.sin_addr.s_addr; | ||
169 | tsinfo.local_port = sin.sin_port; | ||
170 | sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); | ||
171 | tsinfo.peer_addr = sin.sin_addr.s_addr; | ||
172 | tsinfo.peer_port = sin.sin_port; | ||
173 | |||
174 | tsinfo.hdr_rem = tc->t_tinc_hdr_rem; | ||
175 | tsinfo.data_rem = tc->t_tinc_data_rem; | ||
176 | tsinfo.last_sent_nxt = tc->t_last_sent_nxt; | ||
177 | tsinfo.last_expected_una = tc->t_last_expected_una; | ||
178 | tsinfo.last_seen_una = tc->t_last_seen_una; | ||
179 | |||
180 | rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); | ||
181 | } | ||
182 | |||
183 | out: | ||
184 | lens->nr = rds_tcp_tc_count; | ||
185 | lens->each = sizeof(tsinfo); | ||
186 | |||
187 | spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); | ||
188 | } | ||
189 | |||
190 | static int rds_tcp_laddr_check(__be32 addr) | ||
191 | { | ||
192 | if (inet_addr_type(&init_net, addr) == RTN_LOCAL) | ||
193 | return 0; | ||
194 | return -EADDRNOTAVAIL; | ||
195 | } | ||
196 | |||
197 | static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) | ||
198 | { | ||
199 | struct rds_tcp_connection *tc; | ||
200 | |||
201 | tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); | ||
202 | if (tc == NULL) | ||
203 | return -ENOMEM; | ||
204 | |||
205 | tc->t_sock = NULL; | ||
206 | tc->t_tinc = NULL; | ||
207 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); | ||
208 | tc->t_tinc_data_rem = 0; | ||
209 | |||
210 | conn->c_transport_data = tc; | ||
211 | |||
212 | spin_lock_irq(&rds_tcp_conn_lock); | ||
213 | list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); | ||
214 | spin_unlock_irq(&rds_tcp_conn_lock); | ||
215 | |||
216 | rdsdebug("alloced tc %p\n", conn->c_transport_data); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | static void rds_tcp_conn_free(void *arg) | ||
221 | { | ||
222 | struct rds_tcp_connection *tc = arg; | ||
223 | rdsdebug("freeing tc %p\n", tc); | ||
224 | kmem_cache_free(rds_tcp_conn_slab, tc); | ||
225 | } | ||
226 | |||
227 | static void rds_tcp_destroy_conns(void) | ||
228 | { | ||
229 | struct rds_tcp_connection *tc, *_tc; | ||
230 | LIST_HEAD(tmp_list); | ||
231 | |||
232 | /* avoid calling conn_destroy with irqs off */ | ||
233 | spin_lock_irq(&rds_tcp_conn_lock); | ||
234 | list_splice(&rds_tcp_conn_list, &tmp_list); | ||
235 | INIT_LIST_HEAD(&rds_tcp_conn_list); | ||
236 | spin_unlock_irq(&rds_tcp_conn_lock); | ||
237 | |||
238 | list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { | ||
239 | if (tc->conn->c_passive) | ||
240 | rds_conn_destroy(tc->conn->c_passive); | ||
241 | rds_conn_destroy(tc->conn); | ||
242 | } | ||
243 | } | ||
244 | |||
245 | void rds_tcp_exit(void) | ||
246 | { | ||
247 | rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); | ||
248 | rds_tcp_listen_stop(); | ||
249 | rds_tcp_destroy_conns(); | ||
250 | rds_trans_unregister(&rds_tcp_transport); | ||
251 | rds_tcp_recv_exit(); | ||
252 | kmem_cache_destroy(rds_tcp_conn_slab); | ||
253 | } | ||
254 | module_exit(rds_tcp_exit); | ||
255 | |||
256 | struct rds_transport rds_tcp_transport = { | ||
257 | .laddr_check = rds_tcp_laddr_check, | ||
258 | .xmit_prepare = rds_tcp_xmit_prepare, | ||
259 | .xmit_complete = rds_tcp_xmit_complete, | ||
260 | .xmit_cong_map = rds_tcp_xmit_cong_map, | ||
261 | .xmit = rds_tcp_xmit, | ||
262 | .recv = rds_tcp_recv, | ||
263 | .conn_alloc = rds_tcp_conn_alloc, | ||
264 | .conn_free = rds_tcp_conn_free, | ||
265 | .conn_connect = rds_tcp_conn_connect, | ||
266 | .conn_shutdown = rds_tcp_conn_shutdown, | ||
267 | .inc_copy_to_user = rds_tcp_inc_copy_to_user, | ||
268 | .inc_purge = rds_tcp_inc_purge, | ||
269 | .inc_free = rds_tcp_inc_free, | ||
270 | .stats_info_copy = rds_tcp_stats_info_copy, | ||
271 | .exit = rds_tcp_exit, | ||
272 | .t_owner = THIS_MODULE, | ||
273 | .t_name = "tcp", | ||
274 | .t_prefer_loopback = 1, | ||
275 | }; | ||
276 | |||
277 | int __init rds_tcp_init(void) | ||
278 | { | ||
279 | int ret; | ||
280 | |||
281 | rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", | ||
282 | sizeof(struct rds_tcp_connection), | ||
283 | 0, 0, NULL); | ||
284 | if (rds_tcp_conn_slab == NULL) { | ||
285 | ret = -ENOMEM; | ||
286 | goto out; | ||
287 | } | ||
288 | |||
289 | ret = rds_tcp_recv_init(); | ||
290 | if (ret) | ||
291 | goto out_slab; | ||
292 | |||
293 | ret = rds_trans_register(&rds_tcp_transport); | ||
294 | if (ret) | ||
295 | goto out_recv; | ||
296 | |||
297 | ret = rds_tcp_listen_init(); | ||
298 | if (ret) | ||
299 | goto out_register; | ||
300 | |||
301 | rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); | ||
302 | |||
303 | goto out; | ||
304 | |||
305 | out_register: | ||
306 | rds_trans_unregister(&rds_tcp_transport); | ||
307 | out_recv: | ||
308 | rds_tcp_recv_exit(); | ||
309 | out_slab: | ||
310 | kmem_cache_destroy(rds_tcp_conn_slab); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | module_init(rds_tcp_init); | ||
315 | |||
316 | MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); | ||
317 | MODULE_DESCRIPTION("RDS: TCP transport"); | ||
318 | MODULE_LICENSE("Dual BSD/GPL"); | ||
319 | |||
diff --git a/net/rds/tcp.h b/net/rds/tcp.h new file mode 100644 index 000000000000..844fa6b9cf5a --- /dev/null +++ b/net/rds/tcp.h | |||
@@ -0,0 +1,93 @@ | |||
1 | #ifndef _RDS_TCP_H | ||
2 | #define _RDS_TCP_H | ||
3 | |||
4 | #define RDS_TCP_PORT 16385 | ||
5 | |||
6 | struct rds_tcp_incoming { | ||
7 | struct rds_incoming ti_inc; | ||
8 | struct sk_buff_head ti_skb_list; | ||
9 | }; | ||
10 | |||
11 | struct rds_tcp_connection { | ||
12 | |||
13 | struct list_head t_tcp_node; | ||
14 | struct rds_connection *conn; | ||
15 | struct socket *t_sock; | ||
16 | void *t_orig_write_space; | ||
17 | void *t_orig_data_ready; | ||
18 | void *t_orig_state_change; | ||
19 | |||
20 | struct rds_tcp_incoming *t_tinc; | ||
21 | size_t t_tinc_hdr_rem; | ||
22 | size_t t_tinc_data_rem; | ||
23 | |||
24 | /* XXX error report? */ | ||
25 | struct work_struct t_conn_w; | ||
26 | struct work_struct t_send_w; | ||
27 | struct work_struct t_down_w; | ||
28 | struct work_struct t_recv_w; | ||
29 | |||
30 | /* for info exporting only */ | ||
31 | struct list_head t_list_item; | ||
32 | u32 t_last_sent_nxt; | ||
33 | u32 t_last_expected_una; | ||
34 | u32 t_last_seen_una; | ||
35 | }; | ||
36 | |||
37 | struct rds_tcp_statistics { | ||
38 | uint64_t s_tcp_data_ready_calls; | ||
39 | uint64_t s_tcp_write_space_calls; | ||
40 | uint64_t s_tcp_sndbuf_full; | ||
41 | uint64_t s_tcp_connect_raced; | ||
42 | uint64_t s_tcp_listen_closed_stale; | ||
43 | }; | ||
44 | |||
45 | /* tcp.c */ | ||
46 | int __init rds_tcp_init(void); | ||
47 | void rds_tcp_exit(void); | ||
48 | void rds_tcp_tune(struct socket *sock); | ||
49 | void rds_tcp_nonagle(struct socket *sock); | ||
50 | void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); | ||
51 | void rds_tcp_restore_callbacks(struct socket *sock, | ||
52 | struct rds_tcp_connection *tc); | ||
53 | u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); | ||
54 | u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); | ||
55 | u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); | ||
56 | extern struct rds_transport rds_tcp_transport; | ||
57 | |||
58 | /* tcp_connect.c */ | ||
59 | int rds_tcp_conn_connect(struct rds_connection *conn); | ||
60 | void rds_tcp_conn_shutdown(struct rds_connection *conn); | ||
61 | void rds_tcp_state_change(struct sock *sk); | ||
62 | |||
63 | /* tcp_listen.c */ | ||
64 | int __init rds_tcp_listen_init(void); | ||
65 | void rds_tcp_listen_stop(void); | ||
66 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes); | ||
67 | |||
68 | /* tcp_recv.c */ | ||
69 | int __init rds_tcp_recv_init(void); | ||
70 | void rds_tcp_recv_exit(void); | ||
71 | void rds_tcp_data_ready(struct sock *sk, int bytes); | ||
72 | int rds_tcp_recv(struct rds_connection *conn); | ||
73 | void rds_tcp_inc_purge(struct rds_incoming *inc); | ||
74 | void rds_tcp_inc_free(struct rds_incoming *inc); | ||
75 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | ||
76 | size_t size); | ||
77 | |||
78 | /* tcp_send.c */ | ||
79 | void rds_tcp_xmit_prepare(struct rds_connection *conn); | ||
80 | void rds_tcp_xmit_complete(struct rds_connection *conn); | ||
81 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
82 | unsigned int hdr_off, unsigned int sg, unsigned int off); | ||
83 | void rds_tcp_write_space(struct sock *sk); | ||
84 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
85 | struct rds_cong_map *map, unsigned long offset); | ||
86 | |||
87 | /* tcp_stats.c */ | ||
88 | DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); | ||
89 | #define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member) | ||
90 | unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, | ||
91 | unsigned int avail); | ||
92 | |||
93 | #endif | ||
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c new file mode 100644 index 000000000000..211522f9a9a2 --- /dev/null +++ b/net/rds/tcp_connect.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <net/tcp.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "tcp.h" | ||
39 | |||
40 | void rds_tcp_state_change(struct sock *sk) | ||
41 | { | ||
42 | void (*state_change)(struct sock *sk); | ||
43 | struct rds_connection *conn; | ||
44 | struct rds_tcp_connection *tc; | ||
45 | |||
46 | read_lock(&sk->sk_callback_lock); | ||
47 | conn = sk->sk_user_data; | ||
48 | if (conn == NULL) { | ||
49 | state_change = sk->sk_state_change; | ||
50 | goto out; | ||
51 | } | ||
52 | tc = conn->c_transport_data; | ||
53 | state_change = tc->t_orig_state_change; | ||
54 | |||
55 | rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); | ||
56 | |||
57 | switch(sk->sk_state) { | ||
58 | /* ignore connecting sockets as they make progress */ | ||
59 | case TCP_SYN_SENT: | ||
60 | case TCP_SYN_RECV: | ||
61 | break; | ||
62 | case TCP_ESTABLISHED: | ||
63 | rds_connect_complete(conn); | ||
64 | break; | ||
65 | case TCP_CLOSE: | ||
66 | rds_conn_drop(conn); | ||
67 | default: | ||
68 | break; | ||
69 | } | ||
70 | out: | ||
71 | read_unlock(&sk->sk_callback_lock); | ||
72 | state_change(sk); | ||
73 | } | ||
74 | |||
75 | int rds_tcp_conn_connect(struct rds_connection *conn) | ||
76 | { | ||
77 | struct socket *sock = NULL; | ||
78 | struct sockaddr_in src, dest; | ||
79 | int ret; | ||
80 | |||
81 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | ||
82 | if (ret < 0) | ||
83 | goto out; | ||
84 | |||
85 | rds_tcp_tune(sock); | ||
86 | |||
87 | src.sin_family = AF_INET; | ||
88 | src.sin_addr.s_addr = (__force u32)conn->c_laddr; | ||
89 | src.sin_port = (__force u16)htons(0); | ||
90 | |||
91 | ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); | ||
92 | if (ret) { | ||
93 | rdsdebug("bind failed with %d at address %u.%u.%u.%u\n", | ||
94 | ret, NIPQUAD(conn->c_laddr)); | ||
95 | goto out; | ||
96 | } | ||
97 | |||
98 | dest.sin_family = AF_INET; | ||
99 | dest.sin_addr.s_addr = (__force u32)conn->c_faddr; | ||
100 | dest.sin_port = (__force u16)htons(RDS_TCP_PORT); | ||
101 | |||
102 | /* | ||
103 | * once we call connect() we can start getting callbacks and they | ||
104 | * own the socket | ||
105 | */ | ||
106 | rds_tcp_set_callbacks(sock, conn); | ||
107 | ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), | ||
108 | O_NONBLOCK); | ||
109 | sock = NULL; | ||
110 | |||
111 | rdsdebug("connect to address %u.%u.%u.%u returned %d\n", | ||
112 | NIPQUAD(conn->c_faddr), ret); | ||
113 | if (ret == -EINPROGRESS) | ||
114 | ret = 0; | ||
115 | |||
116 | out: | ||
117 | if (sock) | ||
118 | sock_release(sock); | ||
119 | return ret; | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * Before killing the tcp socket this needs to serialize with callbacks. The | ||
124 | * caller has already grabbed the sending sem so we're serialized with other | ||
125 | * senders. | ||
126 | * | ||
127 | * TCP calls the callbacks with the sock lock so we hold it while we reset the | ||
128 | * callbacks to those set by TCP. Our callbacks won't execute again once we | ||
129 | * hold the sock lock. | ||
130 | */ | ||
131 | void rds_tcp_conn_shutdown(struct rds_connection *conn) | ||
132 | { | ||
133 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
134 | struct socket *sock = tc->t_sock; | ||
135 | |||
136 | rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); | ||
137 | |||
138 | if (sock) { | ||
139 | sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); | ||
140 | lock_sock(sock->sk); | ||
141 | rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ | ||
142 | |||
143 | release_sock(sock->sk); | ||
144 | sock_release(sock); | ||
145 | }; | ||
146 | |||
147 | if (tc->t_tinc) { | ||
148 | rds_inc_put(&tc->t_tinc->ti_inc); | ||
149 | tc->t_tinc = NULL; | ||
150 | } | ||
151 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); | ||
152 | tc->t_tinc_data_rem = 0; | ||
153 | } | ||
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c new file mode 100644 index 000000000000..24b743eb0b1b --- /dev/null +++ b/net/rds/tcp_listen.c | |||
@@ -0,0 +1,199 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <net/tcp.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "tcp.h" | ||
39 | |||
40 | /* | ||
41 | * cheesy, but simple.. | ||
42 | */ | ||
43 | static void rds_tcp_accept_worker(struct work_struct *work); | ||
44 | static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); | ||
45 | static struct socket *rds_tcp_listen_sock; | ||
46 | |||
47 | static int rds_tcp_accept_one(struct socket *sock) | ||
48 | { | ||
49 | struct socket *new_sock = NULL; | ||
50 | struct rds_connection *conn; | ||
51 | int ret; | ||
52 | struct inet_sock *inet; | ||
53 | |||
54 | ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, | ||
55 | sock->sk->sk_protocol, &new_sock); | ||
56 | if (ret) | ||
57 | goto out; | ||
58 | |||
59 | new_sock->type = sock->type; | ||
60 | new_sock->ops = sock->ops; | ||
61 | ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); | ||
62 | if (ret < 0) | ||
63 | goto out; | ||
64 | |||
65 | rds_tcp_tune(new_sock); | ||
66 | |||
67 | inet = inet_sk(new_sock->sk); | ||
68 | |||
69 | rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", | ||
70 | NIPQUAD(inet->saddr), ntohs(inet->sport), | ||
71 | NIPQUAD(inet->daddr), ntohs(inet->dport)); | ||
72 | |||
73 | conn = rds_conn_create(inet->saddr, inet->daddr, &rds_tcp_transport, | ||
74 | GFP_KERNEL); | ||
75 | if (IS_ERR(conn)) { | ||
76 | ret = PTR_ERR(conn); | ||
77 | goto out; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * see the comment above rds_queue_delayed_reconnect() | ||
82 | */ | ||
83 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { | ||
84 | if (rds_conn_state(conn) == RDS_CONN_UP) | ||
85 | rds_tcp_stats_inc(s_tcp_listen_closed_stale); | ||
86 | else | ||
87 | rds_tcp_stats_inc(s_tcp_connect_raced); | ||
88 | rds_conn_drop(conn); | ||
89 | ret = 0; | ||
90 | goto out; | ||
91 | } | ||
92 | |||
93 | rds_tcp_set_callbacks(new_sock, conn); | ||
94 | rds_connect_complete(conn); | ||
95 | new_sock = NULL; | ||
96 | ret = 0; | ||
97 | |||
98 | out: | ||
99 | if (new_sock) | ||
100 | sock_release(new_sock); | ||
101 | return ret; | ||
102 | } | ||
103 | |||
104 | static void rds_tcp_accept_worker(struct work_struct *work) | ||
105 | { | ||
106 | while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) | ||
107 | cond_resched(); | ||
108 | } | ||
109 | |||
110 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes) | ||
111 | { | ||
112 | void (*ready)(struct sock *sk, int bytes); | ||
113 | |||
114 | rdsdebug("listen data ready sk %p\n", sk); | ||
115 | |||
116 | read_lock(&sk->sk_callback_lock); | ||
117 | ready = sk->sk_user_data; | ||
118 | if (ready == NULL) { /* check for teardown race */ | ||
119 | ready = sk->sk_data_ready; | ||
120 | goto out; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * ->sk_data_ready is also called for a newly established child socket | ||
125 | * before it has been accepted and the accepter has set up their | ||
126 | * data_ready.. we only want to queue listen work for our listening | ||
127 | * socket | ||
128 | */ | ||
129 | if (sk->sk_state == TCP_LISTEN) | ||
130 | queue_work(rds_wq, &rds_tcp_listen_work); | ||
131 | |||
132 | out: | ||
133 | read_unlock(&sk->sk_callback_lock); | ||
134 | ready(sk, bytes); | ||
135 | } | ||
136 | |||
137 | int __init rds_tcp_listen_init(void) | ||
138 | { | ||
139 | struct sockaddr_in sin; | ||
140 | struct socket *sock = NULL; | ||
141 | int ret; | ||
142 | |||
143 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | ||
144 | if (ret < 0) | ||
145 | goto out; | ||
146 | |||
147 | sock->sk->sk_reuse = 1; | ||
148 | rds_tcp_nonagle(sock); | ||
149 | |||
150 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
151 | sock->sk->sk_user_data = sock->sk->sk_data_ready; | ||
152 | sock->sk->sk_data_ready = rds_tcp_listen_data_ready; | ||
153 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
154 | |||
155 | sin.sin_family = PF_INET, | ||
156 | sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); | ||
157 | sin.sin_port = (__force u16)htons(RDS_TCP_PORT); | ||
158 | |||
159 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); | ||
160 | if (ret < 0) | ||
161 | goto out; | ||
162 | |||
163 | ret = sock->ops->listen(sock, 64); | ||
164 | if (ret < 0) | ||
165 | goto out; | ||
166 | |||
167 | rds_tcp_listen_sock = sock; | ||
168 | sock = NULL; | ||
169 | out: | ||
170 | if (sock) | ||
171 | sock_release(sock); | ||
172 | return ret; | ||
173 | } | ||
174 | |||
175 | void rds_tcp_listen_stop(void) | ||
176 | { | ||
177 | struct socket *sock = rds_tcp_listen_sock; | ||
178 | struct sock *sk; | ||
179 | |||
180 | if (sock == NULL) | ||
181 | return; | ||
182 | |||
183 | sk = sock->sk; | ||
184 | |||
185 | /* serialize with and prevent further callbacks */ | ||
186 | lock_sock(sk); | ||
187 | write_lock_bh(&sk->sk_callback_lock); | ||
188 | if (sk->sk_user_data) { | ||
189 | sk->sk_data_ready = sk->sk_user_data; | ||
190 | sk->sk_user_data = NULL; | ||
191 | } | ||
192 | write_unlock_bh(&sk->sk_callback_lock); | ||
193 | release_sock(sk); | ||
194 | |||
195 | /* wait for accepts to stop and close the socket */ | ||
196 | flush_workqueue(rds_wq); | ||
197 | sock_release(sock); | ||
198 | rds_tcp_listen_sock = NULL; | ||
199 | } | ||
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c new file mode 100644 index 000000000000..c00dafffbb5a --- /dev/null +++ b/net/rds/tcp_recv.c | |||
@@ -0,0 +1,356 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <net/tcp.h> | ||
35 | |||
36 | #include "rds.h" | ||
37 | #include "tcp.h" | ||
38 | |||
39 | static struct kmem_cache *rds_tcp_incoming_slab; | ||
40 | |||
41 | void rds_tcp_inc_purge(struct rds_incoming *inc) | ||
42 | { | ||
43 | struct rds_tcp_incoming *tinc; | ||
44 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | ||
45 | rdsdebug("purging tinc %p inc %p\n", tinc, inc); | ||
46 | skb_queue_purge(&tinc->ti_skb_list); | ||
47 | } | ||
48 | |||
49 | void rds_tcp_inc_free(struct rds_incoming *inc) | ||
50 | { | ||
51 | struct rds_tcp_incoming *tinc; | ||
52 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | ||
53 | rds_tcp_inc_purge(inc); | ||
54 | rdsdebug("freeing tinc %p inc %p\n", tinc, inc); | ||
55 | kmem_cache_free(rds_tcp_incoming_slab, tinc); | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * this is pretty lame, but, whatever. | ||
60 | */ | ||
61 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | ||
62 | size_t size) | ||
63 | { | ||
64 | struct rds_tcp_incoming *tinc; | ||
65 | struct iovec *iov, tmp; | ||
66 | struct sk_buff *skb; | ||
67 | unsigned long to_copy, skb_off; | ||
68 | int ret = 0; | ||
69 | |||
70 | if (size == 0) | ||
71 | goto out; | ||
72 | |||
73 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | ||
74 | iov = first_iov; | ||
75 | tmp = *iov; | ||
76 | |||
77 | skb_queue_walk(&tinc->ti_skb_list, skb) { | ||
78 | skb_off = 0; | ||
79 | while (skb_off < skb->len) { | ||
80 | while (tmp.iov_len == 0) { | ||
81 | iov++; | ||
82 | tmp = *iov; | ||
83 | } | ||
84 | |||
85 | to_copy = min(tmp.iov_len, size); | ||
86 | to_copy = min(to_copy, skb->len - skb_off); | ||
87 | |||
88 | rdsdebug("ret %d size %zu skb %p skb_off %lu " | ||
89 | "skblen %d iov_base %p iov_len %zu cpy %lu\n", | ||
90 | ret, size, skb, skb_off, skb->len, | ||
91 | tmp.iov_base, tmp.iov_len, to_copy); | ||
92 | |||
93 | /* modifies tmp as it copies */ | ||
94 | if (skb_copy_datagram_iovec(skb, skb_off, &tmp, | ||
95 | to_copy)) { | ||
96 | ret = -EFAULT; | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | size -= to_copy; | ||
101 | ret += to_copy; | ||
102 | skb_off += to_copy; | ||
103 | if (size == 0) | ||
104 | goto out; | ||
105 | } | ||
106 | } | ||
107 | out: | ||
108 | return ret; | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * We have a series of skbs that have fragmented pieces of the congestion | ||
113 | * bitmap. They must add up to the exact size of the congestion bitmap. We | ||
114 | * use the skb helpers to copy those into the pages that make up the in-memory | ||
115 | * congestion bitmap for the remote address of this connection. We then tell | ||
116 | * the congestion core that the bitmap has been changed so that it can wake up | ||
117 | * sleepers. | ||
118 | * | ||
119 | * This is racing with sending paths which are using test_bit to see if the | ||
120 | * bitmap indicates that their recipient is congested. | ||
121 | */ | ||
122 | |||
123 | static void rds_tcp_cong_recv(struct rds_connection *conn, | ||
124 | struct rds_tcp_incoming *tinc) | ||
125 | { | ||
126 | struct sk_buff *skb; | ||
127 | unsigned int to_copy, skb_off; | ||
128 | unsigned int map_off; | ||
129 | unsigned int map_page; | ||
130 | struct rds_cong_map *map; | ||
131 | int ret; | ||
132 | |||
133 | /* catch completely corrupt packets */ | ||
134 | if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) | ||
135 | return; | ||
136 | |||
137 | map_page = 0; | ||
138 | map_off = 0; | ||
139 | map = conn->c_fcong; | ||
140 | |||
141 | skb_queue_walk(&tinc->ti_skb_list, skb) { | ||
142 | skb_off = 0; | ||
143 | while (skb_off < skb->len) { | ||
144 | to_copy = min_t(unsigned int, PAGE_SIZE - map_off, | ||
145 | skb->len - skb_off); | ||
146 | |||
147 | BUG_ON(map_page >= RDS_CONG_MAP_PAGES); | ||
148 | |||
149 | /* only returns 0 or -error */ | ||
150 | ret = skb_copy_bits(skb, skb_off, | ||
151 | (void *)map->m_page_addrs[map_page] + map_off, | ||
152 | to_copy); | ||
153 | BUG_ON(ret != 0); | ||
154 | |||
155 | skb_off += to_copy; | ||
156 | map_off += to_copy; | ||
157 | if (map_off == PAGE_SIZE) { | ||
158 | map_off = 0; | ||
159 | map_page++; | ||
160 | } | ||
161 | } | ||
162 | } | ||
163 | |||
164 | rds_cong_map_updated(map, ~(u64) 0); | ||
165 | } | ||
166 | |||
167 | struct rds_tcp_desc_arg { | ||
168 | struct rds_connection *conn; | ||
169 | gfp_t gfp; | ||
170 | enum km_type km; | ||
171 | }; | ||
172 | |||
173 | static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, | ||
174 | unsigned int offset, size_t len) | ||
175 | { | ||
176 | struct rds_tcp_desc_arg *arg = desc->arg.data; | ||
177 | struct rds_connection *conn = arg->conn; | ||
178 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
179 | struct rds_tcp_incoming *tinc = tc->t_tinc; | ||
180 | struct sk_buff *clone; | ||
181 | size_t left = len, to_copy; | ||
182 | |||
183 | rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, | ||
184 | len); | ||
185 | |||
186 | /* | ||
187 | * tcp_read_sock() interprets partial progress as an indication to stop | ||
188 | * processing. | ||
189 | */ | ||
190 | while (left) { | ||
191 | if (tinc == NULL) { | ||
192 | tinc = kmem_cache_alloc(rds_tcp_incoming_slab, | ||
193 | arg->gfp); | ||
194 | if (tinc == NULL) { | ||
195 | desc->error = -ENOMEM; | ||
196 | goto out; | ||
197 | } | ||
198 | tc->t_tinc = tinc; | ||
199 | rdsdebug("alloced tinc %p\n", tinc); | ||
200 | rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); | ||
201 | /* | ||
202 | * XXX * we might be able to use the __ variants when | ||
203 | * we've already serialized at a higher level. | ||
204 | */ | ||
205 | skb_queue_head_init(&tinc->ti_skb_list); | ||
206 | } | ||
207 | |||
208 | if (left && tc->t_tinc_hdr_rem) { | ||
209 | to_copy = min(tc->t_tinc_hdr_rem, left); | ||
210 | rdsdebug("copying %zu header from skb %p\n", to_copy, | ||
211 | skb); | ||
212 | skb_copy_bits(skb, offset, | ||
213 | (char *)&tinc->ti_inc.i_hdr + | ||
214 | sizeof(struct rds_header) - | ||
215 | tc->t_tinc_hdr_rem, | ||
216 | to_copy); | ||
217 | tc->t_tinc_hdr_rem -= to_copy; | ||
218 | left -= to_copy; | ||
219 | offset += to_copy; | ||
220 | |||
221 | if (tc->t_tinc_hdr_rem == 0) { | ||
222 | /* could be 0 for a 0 len message */ | ||
223 | tc->t_tinc_data_rem = | ||
224 | be32_to_cpu(tinc->ti_inc.i_hdr.h_len); | ||
225 | } | ||
226 | } | ||
227 | |||
228 | if (left && tc->t_tinc_data_rem) { | ||
229 | clone = skb_clone(skb, arg->gfp); | ||
230 | if (clone == NULL) { | ||
231 | desc->error = -ENOMEM; | ||
232 | goto out; | ||
233 | } | ||
234 | |||
235 | to_copy = min(tc->t_tinc_data_rem, left); | ||
236 | pskb_pull(clone, offset); | ||
237 | pskb_trim(clone, to_copy); | ||
238 | skb_queue_tail(&tinc->ti_skb_list, clone); | ||
239 | |||
240 | rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " | ||
241 | "clone %p data %p len %d\n", | ||
242 | skb, skb->data, skb->len, offset, to_copy, | ||
243 | clone, clone->data, clone->len); | ||
244 | |||
245 | tc->t_tinc_data_rem -= to_copy; | ||
246 | left -= to_copy; | ||
247 | offset += to_copy; | ||
248 | } | ||
249 | |||
250 | if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { | ||
251 | if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) | ||
252 | rds_tcp_cong_recv(conn, tinc); | ||
253 | else | ||
254 | rds_recv_incoming(conn, conn->c_faddr, | ||
255 | conn->c_laddr, &tinc->ti_inc, | ||
256 | arg->gfp, arg->km); | ||
257 | |||
258 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); | ||
259 | tc->t_tinc_data_rem = 0; | ||
260 | tc->t_tinc = NULL; | ||
261 | rds_inc_put(&tinc->ti_inc); | ||
262 | tinc = NULL; | ||
263 | } | ||
264 | } | ||
265 | out: | ||
266 | rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", | ||
267 | len, left, skb->len, | ||
268 | skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); | ||
269 | return len - left; | ||
270 | } | ||
271 | |||
272 | /* the caller has to hold the sock lock */ | ||
273 | int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) | ||
274 | { | ||
275 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
276 | struct socket *sock = tc->t_sock; | ||
277 | read_descriptor_t desc; | ||
278 | struct rds_tcp_desc_arg arg; | ||
279 | |||
280 | /* It's like glib in the kernel! */ | ||
281 | arg.conn = conn; | ||
282 | arg.gfp = gfp; | ||
283 | arg.km = km; | ||
284 | desc.arg.data = &arg; | ||
285 | desc.error = 0; | ||
286 | desc.count = 1; /* give more than one skb per call */ | ||
287 | |||
288 | tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); | ||
289 | rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, | ||
290 | desc.error); | ||
291 | |||
292 | return desc.error; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from | ||
297 | * data_ready. | ||
298 | * | ||
299 | * if we fail to allocate we're in trouble.. blindly wait some time before | ||
300 | * trying again to see if the VM can free up something for us. | ||
301 | */ | ||
302 | int rds_tcp_recv(struct rds_connection *conn) | ||
303 | { | ||
304 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
305 | struct socket *sock = tc->t_sock; | ||
306 | int ret = 0; | ||
307 | |||
308 | rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); | ||
309 | |||
310 | lock_sock(sock->sk); | ||
311 | ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); | ||
312 | release_sock(sock->sk); | ||
313 | |||
314 | return ret; | ||
315 | } | ||
316 | |||
317 | void rds_tcp_data_ready(struct sock *sk, int bytes) | ||
318 | { | ||
319 | void (*ready)(struct sock *sk, int bytes); | ||
320 | struct rds_connection *conn; | ||
321 | struct rds_tcp_connection *tc; | ||
322 | |||
323 | rdsdebug("data ready sk %p bytes %d\n", sk, bytes); | ||
324 | |||
325 | read_lock(&sk->sk_callback_lock); | ||
326 | conn = sk->sk_user_data; | ||
327 | if (conn == NULL) { /* check for teardown race */ | ||
328 | ready = sk->sk_data_ready; | ||
329 | goto out; | ||
330 | } | ||
331 | |||
332 | tc = conn->c_transport_data; | ||
333 | ready = tc->t_orig_data_ready; | ||
334 | rds_tcp_stats_inc(s_tcp_data_ready_calls); | ||
335 | |||
336 | if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) | ||
337 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
338 | out: | ||
339 | read_unlock(&sk->sk_callback_lock); | ||
340 | ready(sk, bytes); | ||
341 | } | ||
342 | |||
343 | int __init rds_tcp_recv_init(void) | ||
344 | { | ||
345 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", | ||
346 | sizeof(struct rds_tcp_incoming), | ||
347 | 0, 0, NULL); | ||
348 | if (rds_tcp_incoming_slab == NULL) | ||
349 | return -ENOMEM; | ||
350 | return 0; | ||
351 | } | ||
352 | |||
353 | void rds_tcp_recv_exit(void) | ||
354 | { | ||
355 | kmem_cache_destroy(rds_tcp_incoming_slab); | ||
356 | } | ||
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c new file mode 100644 index 000000000000..ab545e0cd5d6 --- /dev/null +++ b/net/rds/tcp_send.c | |||
@@ -0,0 +1,263 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <net/tcp.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "tcp.h" | ||
39 | |||
40 | static void rds_tcp_cork(struct socket *sock, int val) | ||
41 | { | ||
42 | mm_segment_t oldfs; | ||
43 | |||
44 | oldfs = get_fs(); | ||
45 | set_fs(KERNEL_DS); | ||
46 | sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val, | ||
47 | sizeof(val)); | ||
48 | set_fs(oldfs); | ||
49 | } | ||
50 | |||
51 | void rds_tcp_xmit_prepare(struct rds_connection *conn) | ||
52 | { | ||
53 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
54 | |||
55 | rds_tcp_cork(tc->t_sock, 1); | ||
56 | } | ||
57 | |||
58 | void rds_tcp_xmit_complete(struct rds_connection *conn) | ||
59 | { | ||
60 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
61 | |||
62 | rds_tcp_cork(tc->t_sock, 0); | ||
63 | } | ||
64 | |||
65 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
66 | int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) | ||
67 | { | ||
68 | struct kvec vec = { | ||
69 | .iov_base = data, | ||
70 | .iov_len = len, | ||
71 | }; | ||
72 | struct msghdr msg = { | ||
73 | .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, | ||
74 | }; | ||
75 | |||
76 | return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); | ||
77 | } | ||
78 | |||
79 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
80 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
81 | struct rds_cong_map *map, unsigned long offset) | ||
82 | { | ||
83 | static struct rds_header rds_tcp_map_header = { | ||
84 | .h_flags = RDS_FLAG_CONG_BITMAP, | ||
85 | }; | ||
86 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
87 | unsigned long i; | ||
88 | int ret; | ||
89 | int copied = 0; | ||
90 | |||
91 | /* Some problem claims cpu_to_be32(constant) isn't a constant. */ | ||
92 | rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); | ||
93 | |||
94 | if (offset < sizeof(struct rds_header)) { | ||
95 | ret = rds_tcp_sendmsg(tc->t_sock, | ||
96 | (void *)&rds_tcp_map_header + offset, | ||
97 | sizeof(struct rds_header) - offset); | ||
98 | if (ret <= 0) | ||
99 | return ret; | ||
100 | offset += ret; | ||
101 | copied = ret; | ||
102 | if (offset < sizeof(struct rds_header)) | ||
103 | return ret; | ||
104 | } | ||
105 | |||
106 | offset -= sizeof(struct rds_header); | ||
107 | i = offset / PAGE_SIZE; | ||
108 | offset = offset % PAGE_SIZE; | ||
109 | BUG_ON(i >= RDS_CONG_MAP_PAGES); | ||
110 | |||
111 | do { | ||
112 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | ||
113 | virt_to_page(map->m_page_addrs[i]), | ||
114 | offset, PAGE_SIZE - offset, | ||
115 | MSG_DONTWAIT); | ||
116 | if (ret <= 0) | ||
117 | break; | ||
118 | copied += ret; | ||
119 | offset += ret; | ||
120 | if (offset == PAGE_SIZE) { | ||
121 | offset = 0; | ||
122 | i++; | ||
123 | } | ||
124 | } while (i < RDS_CONG_MAP_PAGES); | ||
125 | |||
126 | return copied ? copied : ret; | ||
127 | } | ||
128 | |||
129 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
130 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
131 | unsigned int hdr_off, unsigned int sg, unsigned int off) | ||
132 | { | ||
133 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
134 | int done = 0; | ||
135 | int ret = 0; | ||
136 | |||
137 | if (hdr_off == 0) { | ||
138 | /* | ||
139 | * m_ack_seq is set to the sequence number of the last byte of | ||
140 | * header and data. see rds_tcp_is_acked(). | ||
141 | */ | ||
142 | tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); | ||
143 | rm->m_ack_seq = tc->t_last_sent_nxt + | ||
144 | sizeof(struct rds_header) + | ||
145 | be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; | ||
146 | smp_mb__before_clear_bit(); | ||
147 | set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); | ||
148 | tc->t_last_expected_una = rm->m_ack_seq + 1; | ||
149 | |||
150 | rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", | ||
151 | rm, rds_tcp_snd_nxt(tc), | ||
152 | (unsigned long long)rm->m_ack_seq); | ||
153 | } | ||
154 | |||
155 | if (hdr_off < sizeof(struct rds_header)) { | ||
156 | /* see rds_tcp_write_space() */ | ||
157 | set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); | ||
158 | |||
159 | ret = rds_tcp_sendmsg(tc->t_sock, | ||
160 | (void *)&rm->m_inc.i_hdr + hdr_off, | ||
161 | sizeof(rm->m_inc.i_hdr) - hdr_off); | ||
162 | if (ret < 0) | ||
163 | goto out; | ||
164 | done += ret; | ||
165 | if (hdr_off + done != sizeof(struct rds_header)) | ||
166 | goto out; | ||
167 | } | ||
168 | |||
169 | while (sg < rm->m_nents) { | ||
170 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | ||
171 | sg_page(&rm->m_sg[sg]), | ||
172 | rm->m_sg[sg].offset + off, | ||
173 | rm->m_sg[sg].length - off, | ||
174 | MSG_DONTWAIT|MSG_NOSIGNAL); | ||
175 | rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), | ||
176 | rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, | ||
177 | ret); | ||
178 | if (ret <= 0) | ||
179 | break; | ||
180 | |||
181 | off += ret; | ||
182 | done += ret; | ||
183 | if (off == rm->m_sg[sg].length) { | ||
184 | off = 0; | ||
185 | sg++; | ||
186 | } | ||
187 | } | ||
188 | |||
189 | out: | ||
190 | if (ret <= 0) { | ||
191 | /* write_space will hit after EAGAIN, all else fatal */ | ||
192 | if (ret == -EAGAIN) { | ||
193 | rds_tcp_stats_inc(s_tcp_sndbuf_full); | ||
194 | ret = 0; | ||
195 | } else { | ||
196 | printk(KERN_WARNING "RDS/tcp: send to %u.%u.%u.%u " | ||
197 | "returned %d, disconnecting and reconnecting\n", | ||
198 | NIPQUAD(conn->c_faddr), ret); | ||
199 | rds_conn_drop(conn); | ||
200 | } | ||
201 | } | ||
202 | if (done == 0) | ||
203 | done = ret; | ||
204 | return done; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * rm->m_ack_seq is set to the tcp sequence number that corresponds to the | ||
209 | * last byte of the message, including the header. This means that the | ||
210 | * entire message has been received if rm->m_ack_seq is "before" the next | ||
211 | * unacked byte of the TCP sequence space. We have to do very careful | ||
212 | * wrapping 32bit comparisons here. | ||
213 | */ | ||
214 | static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) | ||
215 | { | ||
216 | if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) | ||
217 | return 0; | ||
218 | return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; | ||
219 | } | ||
220 | |||
221 | void rds_tcp_write_space(struct sock *sk) | ||
222 | { | ||
223 | void (*write_space)(struct sock *sk); | ||
224 | struct rds_connection *conn; | ||
225 | struct rds_tcp_connection *tc; | ||
226 | |||
227 | read_lock(&sk->sk_callback_lock); | ||
228 | conn = sk->sk_user_data; | ||
229 | if (conn == NULL) { | ||
230 | write_space = sk->sk_write_space; | ||
231 | goto out; | ||
232 | } | ||
233 | |||
234 | tc = conn->c_transport_data; | ||
235 | rdsdebug("write_space for tc %p\n", tc); | ||
236 | write_space = tc->t_orig_write_space; | ||
237 | rds_tcp_stats_inc(s_tcp_write_space_calls); | ||
238 | |||
239 | rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); | ||
240 | tc->t_last_seen_una = rds_tcp_snd_una(tc); | ||
241 | rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); | ||
242 | |||
243 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
244 | out: | ||
245 | read_unlock(&sk->sk_callback_lock); | ||
246 | |||
247 | /* | ||
248 | * write_space is only called when data leaves tcp's send queue if | ||
249 | * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put | ||
250 | * data in tcp's send queue because we use write_space to parse the | ||
251 | * sequence numbers and notice that rds messages have been fully | ||
252 | * received. | ||
253 | * | ||
254 | * tcp's write_space clears SOCK_NOSPACE if the send queue has more | ||
255 | * than a certain amount of space. So we need to set it again *after* | ||
256 | * we call tcp's write_space or else we might only get called on the | ||
257 | * first of a series of incoming tcp acks. | ||
258 | */ | ||
259 | write_space(sk); | ||
260 | |||
261 | if (sk->sk_socket) | ||
262 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
263 | } | ||
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c new file mode 100644 index 000000000000..d5898d03cd68 --- /dev/null +++ b/net/rds/tcp_stats.c | |||
@@ -0,0 +1,74 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/percpu.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "tcp.h" | ||
39 | |||
40 | DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) | ||
41 | ____cacheline_aligned; | ||
42 | |||
43 | static const char const *rds_tcp_stat_names[] = { | ||
44 | "tcp_data_ready_calls", | ||
45 | "tcp_write_space_calls", | ||
46 | "tcp_sndbuf_full", | ||
47 | "tcp_connect_raced", | ||
48 | "tcp_listen_closed_stale", | ||
49 | }; | ||
50 | |||
51 | unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, | ||
52 | unsigned int avail) | ||
53 | { | ||
54 | struct rds_tcp_statistics stats = {0, }; | ||
55 | uint64_t *src; | ||
56 | uint64_t *sum; | ||
57 | size_t i; | ||
58 | int cpu; | ||
59 | |||
60 | if (avail < ARRAY_SIZE(rds_tcp_stat_names)) | ||
61 | goto out; | ||
62 | |||
63 | for_each_online_cpu(cpu) { | ||
64 | src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); | ||
65 | sum = (uint64_t *)&stats; | ||
66 | for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) | ||
67 | *(sum++) += *(src++); | ||
68 | } | ||
69 | |||
70 | rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, | ||
71 | ARRAY_SIZE(rds_tcp_stat_names)); | ||
72 | out: | ||
73 | return ARRAY_SIZE(rds_tcp_stat_names); | ||
74 | } | ||