aboutsummaryrefslogtreecommitdiffstats
path: root/net/rds
diff options
context:
space:
mode:
authorAndy Grover <andy.grover@oracle.com>2009-08-21 08:28:31 -0400
committerDavid S. Miller <davem@davemloft.net>2009-08-23 22:13:02 -0400
commit70041088e3b976627ba9a183b812f39ef8a9ba0e (patch)
treebad7b11763d7b02b185bd705fe5ed292397cbc7a /net/rds
parent7d6fd5e7e97a2188d56441e4e96494c21c5994a7 (diff)
RDS: Add TCP transport to RDS
This code allows RDS to be tunneled over a TCP connection. RDMA operations are disabled when using TCP transport, but this frees RDS from the IB/RDMA stack dependency, and allows it to be used with standard Ethernet adapters, or in a VM. Signed-off-by: Andy Grover <andy.grover@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/tcp.c319
-rw-r--r--net/rds/tcp.h93
-rw-r--r--net/rds/tcp_connect.c153
-rw-r--r--net/rds/tcp_listen.c199
-rw-r--r--net/rds/tcp_recv.c356
-rw-r--r--net/rds/tcp_send.c263
-rw-r--r--net/rds/tcp_stats.c74
7 files changed, 1457 insertions, 0 deletions
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
new file mode 100644
index 000000000000..e0ac9009db1a
--- /dev/null
+++ b/net/rds/tcp.c
@@ -0,0 +1,319 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <net/tcp.h>
36
37#include "rds.h"
38#include "tcp.h"
39
40/* only for info exporting */
41static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
42static LIST_HEAD(rds_tcp_tc_list);
43unsigned int rds_tcp_tc_count;
44
45/* Track rds_tcp_connection structs so they can be cleaned up */
46static DEFINE_SPINLOCK(rds_tcp_conn_lock);
47static LIST_HEAD(rds_tcp_conn_list);
48
49static struct kmem_cache *rds_tcp_conn_slab;
50
51#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
52
53/* doing it this way avoids calling tcp_sk() */
54void rds_tcp_nonagle(struct socket *sock)
55{
56 mm_segment_t oldfs = get_fs();
57 int val = 1;
58
59 set_fs(KERNEL_DS);
60 sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
61 sizeof(val));
62 set_fs(oldfs);
63}
64
65void rds_tcp_tune(struct socket *sock)
66{
67 struct sock *sk = sock->sk;
68
69 rds_tcp_nonagle(sock);
70
71 /*
72 * We're trying to saturate gigabit with the default,
73 * see svc_sock_setbufsize().
74 */
75 lock_sock(sk);
76 sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
77 sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
78 sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
79 release_sock(sk);
80}
81
82u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
83{
84 return tcp_sk(tc->t_sock->sk)->snd_nxt;
85}
86
87u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
88{
89 return tcp_sk(tc->t_sock->sk)->snd_una;
90}
91
92void rds_tcp_restore_callbacks(struct socket *sock,
93 struct rds_tcp_connection *tc)
94{
95 rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
96 write_lock_bh(&sock->sk->sk_callback_lock);
97
98 /* done under the callback_lock to serialize with write_space */
99 spin_lock(&rds_tcp_tc_list_lock);
100 list_del_init(&tc->t_list_item);
101 rds_tcp_tc_count--;
102 spin_unlock(&rds_tcp_tc_list_lock);
103
104 tc->t_sock = NULL;
105
106 sock->sk->sk_write_space = tc->t_orig_write_space;
107 sock->sk->sk_data_ready = tc->t_orig_data_ready;
108 sock->sk->sk_state_change = tc->t_orig_state_change;
109 sock->sk->sk_user_data = NULL;
110
111 write_unlock_bh(&sock->sk->sk_callback_lock);
112}
113
114/*
115 * This is the only path that sets tc->t_sock. Send and receive trust that
116 * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
117 * called while it isn't set.
118 */
119void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
120{
121 struct rds_tcp_connection *tc = conn->c_transport_data;
122
123 rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
124 write_lock_bh(&sock->sk->sk_callback_lock);
125
126 /* done under the callback_lock to serialize with write_space */
127 spin_lock(&rds_tcp_tc_list_lock);
128 list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
129 rds_tcp_tc_count++;
130 spin_unlock(&rds_tcp_tc_list_lock);
131
132 /* accepted sockets need our listen data ready undone */
133 if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
134 sock->sk->sk_data_ready = sock->sk->sk_user_data;
135
136 tc->t_sock = sock;
137 tc->conn = conn;
138 tc->t_orig_data_ready = sock->sk->sk_data_ready;
139 tc->t_orig_write_space = sock->sk->sk_write_space;
140 tc->t_orig_state_change = sock->sk->sk_state_change;
141
142 sock->sk->sk_user_data = conn;
143 sock->sk->sk_data_ready = rds_tcp_data_ready;
144 sock->sk->sk_write_space = rds_tcp_write_space;
145 sock->sk->sk_state_change = rds_tcp_state_change;
146
147 write_unlock_bh(&sock->sk->sk_callback_lock);
148}
149
150static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
151 struct rds_info_iterator *iter,
152 struct rds_info_lengths *lens)
153{
154 struct rds_info_tcp_socket tsinfo;
155 struct rds_tcp_connection *tc;
156 unsigned long flags;
157 struct sockaddr_in sin;
158 int sinlen;
159
160 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
161
162 if (len / sizeof(tsinfo) < rds_tcp_tc_count)
163 goto out;
164
165 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
166
167 sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
168 tsinfo.local_addr = sin.sin_addr.s_addr;
169 tsinfo.local_port = sin.sin_port;
170 sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
171 tsinfo.peer_addr = sin.sin_addr.s_addr;
172 tsinfo.peer_port = sin.sin_port;
173
174 tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
175 tsinfo.data_rem = tc->t_tinc_data_rem;
176 tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
177 tsinfo.last_expected_una = tc->t_last_expected_una;
178 tsinfo.last_seen_una = tc->t_last_seen_una;
179
180 rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
181 }
182
183out:
184 lens->nr = rds_tcp_tc_count;
185 lens->each = sizeof(tsinfo);
186
187 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
188}
189
190static int rds_tcp_laddr_check(__be32 addr)
191{
192 if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
193 return 0;
194 return -EADDRNOTAVAIL;
195}
196
197static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
198{
199 struct rds_tcp_connection *tc;
200
201 tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
202 if (tc == NULL)
203 return -ENOMEM;
204
205 tc->t_sock = NULL;
206 tc->t_tinc = NULL;
207 tc->t_tinc_hdr_rem = sizeof(struct rds_header);
208 tc->t_tinc_data_rem = 0;
209
210 conn->c_transport_data = tc;
211
212 spin_lock_irq(&rds_tcp_conn_lock);
213 list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
214 spin_unlock_irq(&rds_tcp_conn_lock);
215
216 rdsdebug("alloced tc %p\n", conn->c_transport_data);
217 return 0;
218}
219
220static void rds_tcp_conn_free(void *arg)
221{
222 struct rds_tcp_connection *tc = arg;
223 rdsdebug("freeing tc %p\n", tc);
224 kmem_cache_free(rds_tcp_conn_slab, tc);
225}
226
227static void rds_tcp_destroy_conns(void)
228{
229 struct rds_tcp_connection *tc, *_tc;
230 LIST_HEAD(tmp_list);
231
232 /* avoid calling conn_destroy with irqs off */
233 spin_lock_irq(&rds_tcp_conn_lock);
234 list_splice(&rds_tcp_conn_list, &tmp_list);
235 INIT_LIST_HEAD(&rds_tcp_conn_list);
236 spin_unlock_irq(&rds_tcp_conn_lock);
237
238 list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
239 if (tc->conn->c_passive)
240 rds_conn_destroy(tc->conn->c_passive);
241 rds_conn_destroy(tc->conn);
242 }
243}
244
245void rds_tcp_exit(void)
246{
247 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
248 rds_tcp_listen_stop();
249 rds_tcp_destroy_conns();
250 rds_trans_unregister(&rds_tcp_transport);
251 rds_tcp_recv_exit();
252 kmem_cache_destroy(rds_tcp_conn_slab);
253}
254module_exit(rds_tcp_exit);
255
256struct rds_transport rds_tcp_transport = {
257 .laddr_check = rds_tcp_laddr_check,
258 .xmit_prepare = rds_tcp_xmit_prepare,
259 .xmit_complete = rds_tcp_xmit_complete,
260 .xmit_cong_map = rds_tcp_xmit_cong_map,
261 .xmit = rds_tcp_xmit,
262 .recv = rds_tcp_recv,
263 .conn_alloc = rds_tcp_conn_alloc,
264 .conn_free = rds_tcp_conn_free,
265 .conn_connect = rds_tcp_conn_connect,
266 .conn_shutdown = rds_tcp_conn_shutdown,
267 .inc_copy_to_user = rds_tcp_inc_copy_to_user,
268 .inc_purge = rds_tcp_inc_purge,
269 .inc_free = rds_tcp_inc_free,
270 .stats_info_copy = rds_tcp_stats_info_copy,
271 .exit = rds_tcp_exit,
272 .t_owner = THIS_MODULE,
273 .t_name = "tcp",
274 .t_prefer_loopback = 1,
275};
276
277int __init rds_tcp_init(void)
278{
279 int ret;
280
281 rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
282 sizeof(struct rds_tcp_connection),
283 0, 0, NULL);
284 if (rds_tcp_conn_slab == NULL) {
285 ret = -ENOMEM;
286 goto out;
287 }
288
289 ret = rds_tcp_recv_init();
290 if (ret)
291 goto out_slab;
292
293 ret = rds_trans_register(&rds_tcp_transport);
294 if (ret)
295 goto out_recv;
296
297 ret = rds_tcp_listen_init();
298 if (ret)
299 goto out_register;
300
301 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
302
303 goto out;
304
305out_register:
306 rds_trans_unregister(&rds_tcp_transport);
307out_recv:
308 rds_tcp_recv_exit();
309out_slab:
310 kmem_cache_destroy(rds_tcp_conn_slab);
311out:
312 return ret;
313}
314module_init(rds_tcp_init);
315
316MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
317MODULE_DESCRIPTION("RDS: TCP transport");
318MODULE_LICENSE("Dual BSD/GPL");
319
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
new file mode 100644
index 000000000000..844fa6b9cf5a
--- /dev/null
+++ b/net/rds/tcp.h
@@ -0,0 +1,93 @@
1#ifndef _RDS_TCP_H
2#define _RDS_TCP_H
3
4#define RDS_TCP_PORT 16385
5
6struct rds_tcp_incoming {
7 struct rds_incoming ti_inc;
8 struct sk_buff_head ti_skb_list;
9};
10
11struct rds_tcp_connection {
12
13 struct list_head t_tcp_node;
14 struct rds_connection *conn;
15 struct socket *t_sock;
16 void *t_orig_write_space;
17 void *t_orig_data_ready;
18 void *t_orig_state_change;
19
20 struct rds_tcp_incoming *t_tinc;
21 size_t t_tinc_hdr_rem;
22 size_t t_tinc_data_rem;
23
24 /* XXX error report? */
25 struct work_struct t_conn_w;
26 struct work_struct t_send_w;
27 struct work_struct t_down_w;
28 struct work_struct t_recv_w;
29
30 /* for info exporting only */
31 struct list_head t_list_item;
32 u32 t_last_sent_nxt;
33 u32 t_last_expected_una;
34 u32 t_last_seen_una;
35};
36
37struct rds_tcp_statistics {
38 uint64_t s_tcp_data_ready_calls;
39 uint64_t s_tcp_write_space_calls;
40 uint64_t s_tcp_sndbuf_full;
41 uint64_t s_tcp_connect_raced;
42 uint64_t s_tcp_listen_closed_stale;
43};
44
45/* tcp.c */
46int __init rds_tcp_init(void);
47void rds_tcp_exit(void);
48void rds_tcp_tune(struct socket *sock);
49void rds_tcp_nonagle(struct socket *sock);
50void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
51void rds_tcp_restore_callbacks(struct socket *sock,
52 struct rds_tcp_connection *tc);
53u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
54u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
55u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
56extern struct rds_transport rds_tcp_transport;
57
58/* tcp_connect.c */
59int rds_tcp_conn_connect(struct rds_connection *conn);
60void rds_tcp_conn_shutdown(struct rds_connection *conn);
61void rds_tcp_state_change(struct sock *sk);
62
63/* tcp_listen.c */
64int __init rds_tcp_listen_init(void);
65void rds_tcp_listen_stop(void);
66void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
67
68/* tcp_recv.c */
69int __init rds_tcp_recv_init(void);
70void rds_tcp_recv_exit(void);
71void rds_tcp_data_ready(struct sock *sk, int bytes);
72int rds_tcp_recv(struct rds_connection *conn);
73void rds_tcp_inc_purge(struct rds_incoming *inc);
74void rds_tcp_inc_free(struct rds_incoming *inc);
75int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
76 size_t size);
77
78/* tcp_send.c */
79void rds_tcp_xmit_prepare(struct rds_connection *conn);
80void rds_tcp_xmit_complete(struct rds_connection *conn);
81int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
82 unsigned int hdr_off, unsigned int sg, unsigned int off);
83void rds_tcp_write_space(struct sock *sk);
84int rds_tcp_xmit_cong_map(struct rds_connection *conn,
85 struct rds_cong_map *map, unsigned long offset);
86
87/* tcp_stats.c */
88DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
89#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
90unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
91 unsigned int avail);
92
93#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
new file mode 100644
index 000000000000..211522f9a9a2
--- /dev/null
+++ b/net/rds/tcp_connect.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <net/tcp.h>
36
37#include "rds.h"
38#include "tcp.h"
39
40void rds_tcp_state_change(struct sock *sk)
41{
42 void (*state_change)(struct sock *sk);
43 struct rds_connection *conn;
44 struct rds_tcp_connection *tc;
45
46 read_lock(&sk->sk_callback_lock);
47 conn = sk->sk_user_data;
48 if (conn == NULL) {
49 state_change = sk->sk_state_change;
50 goto out;
51 }
52 tc = conn->c_transport_data;
53 state_change = tc->t_orig_state_change;
54
55 rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
56
57 switch(sk->sk_state) {
58 /* ignore connecting sockets as they make progress */
59 case TCP_SYN_SENT:
60 case TCP_SYN_RECV:
61 break;
62 case TCP_ESTABLISHED:
63 rds_connect_complete(conn);
64 break;
65 case TCP_CLOSE:
66 rds_conn_drop(conn);
67 default:
68 break;
69 }
70out:
71 read_unlock(&sk->sk_callback_lock);
72 state_change(sk);
73}
74
75int rds_tcp_conn_connect(struct rds_connection *conn)
76{
77 struct socket *sock = NULL;
78 struct sockaddr_in src, dest;
79 int ret;
80
81 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
82 if (ret < 0)
83 goto out;
84
85 rds_tcp_tune(sock);
86
87 src.sin_family = AF_INET;
88 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
89 src.sin_port = (__force u16)htons(0);
90
91 ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
92 if (ret) {
93 rdsdebug("bind failed with %d at address %u.%u.%u.%u\n",
94 ret, NIPQUAD(conn->c_laddr));
95 goto out;
96 }
97
98 dest.sin_family = AF_INET;
99 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
100 dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
101
102 /*
103 * once we call connect() we can start getting callbacks and they
104 * own the socket
105 */
106 rds_tcp_set_callbacks(sock, conn);
107 ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
108 O_NONBLOCK);
109 sock = NULL;
110
111 rdsdebug("connect to address %u.%u.%u.%u returned %d\n",
112 NIPQUAD(conn->c_faddr), ret);
113 if (ret == -EINPROGRESS)
114 ret = 0;
115
116out:
117 if (sock)
118 sock_release(sock);
119 return ret;
120}
121
122/*
123 * Before killing the tcp socket this needs to serialize with callbacks. The
124 * caller has already grabbed the sending sem so we're serialized with other
125 * senders.
126 *
127 * TCP calls the callbacks with the sock lock so we hold it while we reset the
128 * callbacks to those set by TCP. Our callbacks won't execute again once we
129 * hold the sock lock.
130 */
131void rds_tcp_conn_shutdown(struct rds_connection *conn)
132{
133 struct rds_tcp_connection *tc = conn->c_transport_data;
134 struct socket *sock = tc->t_sock;
135
136 rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
137
138 if (sock) {
139 sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
140 lock_sock(sock->sk);
141 rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
142
143 release_sock(sock->sk);
144 sock_release(sock);
145 };
146
147 if (tc->t_tinc) {
148 rds_inc_put(&tc->t_tinc->ti_inc);
149 tc->t_tinc = NULL;
150 }
151 tc->t_tinc_hdr_rem = sizeof(struct rds_header);
152 tc->t_tinc_data_rem = 0;
153}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
new file mode 100644
index 000000000000..24b743eb0b1b
--- /dev/null
+++ b/net/rds/tcp_listen.c
@@ -0,0 +1,199 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <net/tcp.h>
36
37#include "rds.h"
38#include "tcp.h"
39
40/*
41 * cheesy, but simple..
42 */
43static void rds_tcp_accept_worker(struct work_struct *work);
44static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
45static struct socket *rds_tcp_listen_sock;
46
47static int rds_tcp_accept_one(struct socket *sock)
48{
49 struct socket *new_sock = NULL;
50 struct rds_connection *conn;
51 int ret;
52 struct inet_sock *inet;
53
54 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
55 sock->sk->sk_protocol, &new_sock);
56 if (ret)
57 goto out;
58
59 new_sock->type = sock->type;
60 new_sock->ops = sock->ops;
61 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
62 if (ret < 0)
63 goto out;
64
65 rds_tcp_tune(new_sock);
66
67 inet = inet_sk(new_sock->sk);
68
69 rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
70 NIPQUAD(inet->saddr), ntohs(inet->sport),
71 NIPQUAD(inet->daddr), ntohs(inet->dport));
72
73 conn = rds_conn_create(inet->saddr, inet->daddr, &rds_tcp_transport,
74 GFP_KERNEL);
75 if (IS_ERR(conn)) {
76 ret = PTR_ERR(conn);
77 goto out;
78 }
79
80 /*
81 * see the comment above rds_queue_delayed_reconnect()
82 */
83 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
84 if (rds_conn_state(conn) == RDS_CONN_UP)
85 rds_tcp_stats_inc(s_tcp_listen_closed_stale);
86 else
87 rds_tcp_stats_inc(s_tcp_connect_raced);
88 rds_conn_drop(conn);
89 ret = 0;
90 goto out;
91 }
92
93 rds_tcp_set_callbacks(new_sock, conn);
94 rds_connect_complete(conn);
95 new_sock = NULL;
96 ret = 0;
97
98out:
99 if (new_sock)
100 sock_release(new_sock);
101 return ret;
102}
103
104static void rds_tcp_accept_worker(struct work_struct *work)
105{
106 while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
107 cond_resched();
108}
109
110void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
111{
112 void (*ready)(struct sock *sk, int bytes);
113
114 rdsdebug("listen data ready sk %p\n", sk);
115
116 read_lock(&sk->sk_callback_lock);
117 ready = sk->sk_user_data;
118 if (ready == NULL) { /* check for teardown race */
119 ready = sk->sk_data_ready;
120 goto out;
121 }
122
123 /*
124 * ->sk_data_ready is also called for a newly established child socket
125 * before it has been accepted and the accepter has set up their
126 * data_ready.. we only want to queue listen work for our listening
127 * socket
128 */
129 if (sk->sk_state == TCP_LISTEN)
130 queue_work(rds_wq, &rds_tcp_listen_work);
131
132out:
133 read_unlock(&sk->sk_callback_lock);
134 ready(sk, bytes);
135}
136
137int __init rds_tcp_listen_init(void)
138{
139 struct sockaddr_in sin;
140 struct socket *sock = NULL;
141 int ret;
142
143 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
144 if (ret < 0)
145 goto out;
146
147 sock->sk->sk_reuse = 1;
148 rds_tcp_nonagle(sock);
149
150 write_lock_bh(&sock->sk->sk_callback_lock);
151 sock->sk->sk_user_data = sock->sk->sk_data_ready;
152 sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
153 write_unlock_bh(&sock->sk->sk_callback_lock);
154
155 sin.sin_family = PF_INET,
156 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
157 sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
158
159 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
160 if (ret < 0)
161 goto out;
162
163 ret = sock->ops->listen(sock, 64);
164 if (ret < 0)
165 goto out;
166
167 rds_tcp_listen_sock = sock;
168 sock = NULL;
169out:
170 if (sock)
171 sock_release(sock);
172 return ret;
173}
174
175void rds_tcp_listen_stop(void)
176{
177 struct socket *sock = rds_tcp_listen_sock;
178 struct sock *sk;
179
180 if (sock == NULL)
181 return;
182
183 sk = sock->sk;
184
185 /* serialize with and prevent further callbacks */
186 lock_sock(sk);
187 write_lock_bh(&sk->sk_callback_lock);
188 if (sk->sk_user_data) {
189 sk->sk_data_ready = sk->sk_user_data;
190 sk->sk_user_data = NULL;
191 }
192 write_unlock_bh(&sk->sk_callback_lock);
193 release_sock(sk);
194
195 /* wait for accepts to stop and close the socket */
196 flush_workqueue(rds_wq);
197 sock_release(sock);
198 rds_tcp_listen_sock = NULL;
199}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 000000000000..c00dafffbb5a
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,356 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/tcp.h>
35
36#include "rds.h"
37#include "tcp.h"
38
39static struct kmem_cache *rds_tcp_incoming_slab;
40
41void rds_tcp_inc_purge(struct rds_incoming *inc)
42{
43 struct rds_tcp_incoming *tinc;
44 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
45 rdsdebug("purging tinc %p inc %p\n", tinc, inc);
46 skb_queue_purge(&tinc->ti_skb_list);
47}
48
49void rds_tcp_inc_free(struct rds_incoming *inc)
50{
51 struct rds_tcp_incoming *tinc;
52 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
53 rds_tcp_inc_purge(inc);
54 rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
55 kmem_cache_free(rds_tcp_incoming_slab, tinc);
56}
57
58/*
59 * this is pretty lame, but, whatever.
60 */
61int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
62 size_t size)
63{
64 struct rds_tcp_incoming *tinc;
65 struct iovec *iov, tmp;
66 struct sk_buff *skb;
67 unsigned long to_copy, skb_off;
68 int ret = 0;
69
70 if (size == 0)
71 goto out;
72
73 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
74 iov = first_iov;
75 tmp = *iov;
76
77 skb_queue_walk(&tinc->ti_skb_list, skb) {
78 skb_off = 0;
79 while (skb_off < skb->len) {
80 while (tmp.iov_len == 0) {
81 iov++;
82 tmp = *iov;
83 }
84
85 to_copy = min(tmp.iov_len, size);
86 to_copy = min(to_copy, skb->len - skb_off);
87
88 rdsdebug("ret %d size %zu skb %p skb_off %lu "
89 "skblen %d iov_base %p iov_len %zu cpy %lu\n",
90 ret, size, skb, skb_off, skb->len,
91 tmp.iov_base, tmp.iov_len, to_copy);
92
93 /* modifies tmp as it copies */
94 if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
95 to_copy)) {
96 ret = -EFAULT;
97 goto out;
98 }
99
100 size -= to_copy;
101 ret += to_copy;
102 skb_off += to_copy;
103 if (size == 0)
104 goto out;
105 }
106 }
107out:
108 return ret;
109}
110
111/*
112 * We have a series of skbs that have fragmented pieces of the congestion
113 * bitmap. They must add up to the exact size of the congestion bitmap. We
114 * use the skb helpers to copy those into the pages that make up the in-memory
115 * congestion bitmap for the remote address of this connection. We then tell
116 * the congestion core that the bitmap has been changed so that it can wake up
117 * sleepers.
118 *
119 * This is racing with sending paths which are using test_bit to see if the
120 * bitmap indicates that their recipient is congested.
121 */
122
123static void rds_tcp_cong_recv(struct rds_connection *conn,
124 struct rds_tcp_incoming *tinc)
125{
126 struct sk_buff *skb;
127 unsigned int to_copy, skb_off;
128 unsigned int map_off;
129 unsigned int map_page;
130 struct rds_cong_map *map;
131 int ret;
132
133 /* catch completely corrupt packets */
134 if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
135 return;
136
137 map_page = 0;
138 map_off = 0;
139 map = conn->c_fcong;
140
141 skb_queue_walk(&tinc->ti_skb_list, skb) {
142 skb_off = 0;
143 while (skb_off < skb->len) {
144 to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
145 skb->len - skb_off);
146
147 BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
148
149 /* only returns 0 or -error */
150 ret = skb_copy_bits(skb, skb_off,
151 (void *)map->m_page_addrs[map_page] + map_off,
152 to_copy);
153 BUG_ON(ret != 0);
154
155 skb_off += to_copy;
156 map_off += to_copy;
157 if (map_off == PAGE_SIZE) {
158 map_off = 0;
159 map_page++;
160 }
161 }
162 }
163
164 rds_cong_map_updated(map, ~(u64) 0);
165}
166
167struct rds_tcp_desc_arg {
168 struct rds_connection *conn;
169 gfp_t gfp;
170 enum km_type km;
171};
172
173static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
174 unsigned int offset, size_t len)
175{
176 struct rds_tcp_desc_arg *arg = desc->arg.data;
177 struct rds_connection *conn = arg->conn;
178 struct rds_tcp_connection *tc = conn->c_transport_data;
179 struct rds_tcp_incoming *tinc = tc->t_tinc;
180 struct sk_buff *clone;
181 size_t left = len, to_copy;
182
183 rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
184 len);
185
186 /*
187 * tcp_read_sock() interprets partial progress as an indication to stop
188 * processing.
189 */
190 while (left) {
191 if (tinc == NULL) {
192 tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
193 arg->gfp);
194 if (tinc == NULL) {
195 desc->error = -ENOMEM;
196 goto out;
197 }
198 tc->t_tinc = tinc;
199 rdsdebug("alloced tinc %p\n", tinc);
200 rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
201 /*
202 * XXX * we might be able to use the __ variants when
203 * we've already serialized at a higher level.
204 */
205 skb_queue_head_init(&tinc->ti_skb_list);
206 }
207
208 if (left && tc->t_tinc_hdr_rem) {
209 to_copy = min(tc->t_tinc_hdr_rem, left);
210 rdsdebug("copying %zu header from skb %p\n", to_copy,
211 skb);
212 skb_copy_bits(skb, offset,
213 (char *)&tinc->ti_inc.i_hdr +
214 sizeof(struct rds_header) -
215 tc->t_tinc_hdr_rem,
216 to_copy);
217 tc->t_tinc_hdr_rem -= to_copy;
218 left -= to_copy;
219 offset += to_copy;
220
221 if (tc->t_tinc_hdr_rem == 0) {
222 /* could be 0 for a 0 len message */
223 tc->t_tinc_data_rem =
224 be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
225 }
226 }
227
228 if (left && tc->t_tinc_data_rem) {
229 clone = skb_clone(skb, arg->gfp);
230 if (clone == NULL) {
231 desc->error = -ENOMEM;
232 goto out;
233 }
234
235 to_copy = min(tc->t_tinc_data_rem, left);
236 pskb_pull(clone, offset);
237 pskb_trim(clone, to_copy);
238 skb_queue_tail(&tinc->ti_skb_list, clone);
239
240 rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
241 "clone %p data %p len %d\n",
242 skb, skb->data, skb->len, offset, to_copy,
243 clone, clone->data, clone->len);
244
245 tc->t_tinc_data_rem -= to_copy;
246 left -= to_copy;
247 offset += to_copy;
248 }
249
250 if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
251 if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
252 rds_tcp_cong_recv(conn, tinc);
253 else
254 rds_recv_incoming(conn, conn->c_faddr,
255 conn->c_laddr, &tinc->ti_inc,
256 arg->gfp, arg->km);
257
258 tc->t_tinc_hdr_rem = sizeof(struct rds_header);
259 tc->t_tinc_data_rem = 0;
260 tc->t_tinc = NULL;
261 rds_inc_put(&tinc->ti_inc);
262 tinc = NULL;
263 }
264 }
265out:
266 rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
267 len, left, skb->len,
268 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
269 return len - left;
270}
271
272/* the caller has to hold the sock lock */
273int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km)
274{
275 struct rds_tcp_connection *tc = conn->c_transport_data;
276 struct socket *sock = tc->t_sock;
277 read_descriptor_t desc;
278 struct rds_tcp_desc_arg arg;
279
280 /* It's like glib in the kernel! */
281 arg.conn = conn;
282 arg.gfp = gfp;
283 arg.km = km;
284 desc.arg.data = &arg;
285 desc.error = 0;
286 desc.count = 1; /* give more than one skb per call */
287
288 tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
289 rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
290 desc.error);
291
292 return desc.error;
293}
294
295/*
296 * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
297 * data_ready.
298 *
299 * if we fail to allocate we're in trouble.. blindly wait some time before
300 * trying again to see if the VM can free up something for us.
301 */
302int rds_tcp_recv(struct rds_connection *conn)
303{
304 struct rds_tcp_connection *tc = conn->c_transport_data;
305 struct socket *sock = tc->t_sock;
306 int ret = 0;
307
308 rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
309
310 lock_sock(sock->sk);
311 ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0);
312 release_sock(sock->sk);
313
314 return ret;
315}
316
317void rds_tcp_data_ready(struct sock *sk, int bytes)
318{
319 void (*ready)(struct sock *sk, int bytes);
320 struct rds_connection *conn;
321 struct rds_tcp_connection *tc;
322
323 rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
324
325 read_lock(&sk->sk_callback_lock);
326 conn = sk->sk_user_data;
327 if (conn == NULL) { /* check for teardown race */
328 ready = sk->sk_data_ready;
329 goto out;
330 }
331
332 tc = conn->c_transport_data;
333 ready = tc->t_orig_data_ready;
334 rds_tcp_stats_inc(s_tcp_data_ready_calls);
335
336 if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
337 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
338out:
339 read_unlock(&sk->sk_callback_lock);
340 ready(sk, bytes);
341}
342
343int __init rds_tcp_recv_init(void)
344{
345 rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
346 sizeof(struct rds_tcp_incoming),
347 0, 0, NULL);
348 if (rds_tcp_incoming_slab == NULL)
349 return -ENOMEM;
350 return 0;
351}
352
353void rds_tcp_recv_exit(void)
354{
355 kmem_cache_destroy(rds_tcp_incoming_slab);
356}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
new file mode 100644
index 000000000000..ab545e0cd5d6
--- /dev/null
+++ b/net/rds/tcp_send.c
@@ -0,0 +1,263 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <net/tcp.h>
36
37#include "rds.h"
38#include "tcp.h"
39
40static void rds_tcp_cork(struct socket *sock, int val)
41{
42 mm_segment_t oldfs;
43
44 oldfs = get_fs();
45 set_fs(KERNEL_DS);
46 sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
47 sizeof(val));
48 set_fs(oldfs);
49}
50
51void rds_tcp_xmit_prepare(struct rds_connection *conn)
52{
53 struct rds_tcp_connection *tc = conn->c_transport_data;
54
55 rds_tcp_cork(tc->t_sock, 1);
56}
57
58void rds_tcp_xmit_complete(struct rds_connection *conn)
59{
60 struct rds_tcp_connection *tc = conn->c_transport_data;
61
62 rds_tcp_cork(tc->t_sock, 0);
63}
64
65/* the core send_sem serializes this with other xmit and shutdown */
66int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
67{
68 struct kvec vec = {
69 .iov_base = data,
70 .iov_len = len,
71 };
72 struct msghdr msg = {
73 .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
74 };
75
76 return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
77}
78
79/* the core send_sem serializes this with other xmit and shutdown */
80int rds_tcp_xmit_cong_map(struct rds_connection *conn,
81 struct rds_cong_map *map, unsigned long offset)
82{
83 static struct rds_header rds_tcp_map_header = {
84 .h_flags = RDS_FLAG_CONG_BITMAP,
85 };
86 struct rds_tcp_connection *tc = conn->c_transport_data;
87 unsigned long i;
88 int ret;
89 int copied = 0;
90
91 /* Some problem claims cpu_to_be32(constant) isn't a constant. */
92 rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
93
94 if (offset < sizeof(struct rds_header)) {
95 ret = rds_tcp_sendmsg(tc->t_sock,
96 (void *)&rds_tcp_map_header + offset,
97 sizeof(struct rds_header) - offset);
98 if (ret <= 0)
99 return ret;
100 offset += ret;
101 copied = ret;
102 if (offset < sizeof(struct rds_header))
103 return ret;
104 }
105
106 offset -= sizeof(struct rds_header);
107 i = offset / PAGE_SIZE;
108 offset = offset % PAGE_SIZE;
109 BUG_ON(i >= RDS_CONG_MAP_PAGES);
110
111 do {
112 ret = tc->t_sock->ops->sendpage(tc->t_sock,
113 virt_to_page(map->m_page_addrs[i]),
114 offset, PAGE_SIZE - offset,
115 MSG_DONTWAIT);
116 if (ret <= 0)
117 break;
118 copied += ret;
119 offset += ret;
120 if (offset == PAGE_SIZE) {
121 offset = 0;
122 i++;
123 }
124 } while (i < RDS_CONG_MAP_PAGES);
125
126 return copied ? copied : ret;
127}
128
129/* the core send_sem serializes this with other xmit and shutdown */
130int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
131 unsigned int hdr_off, unsigned int sg, unsigned int off)
132{
133 struct rds_tcp_connection *tc = conn->c_transport_data;
134 int done = 0;
135 int ret = 0;
136
137 if (hdr_off == 0) {
138 /*
139 * m_ack_seq is set to the sequence number of the last byte of
140 * header and data. see rds_tcp_is_acked().
141 */
142 tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
143 rm->m_ack_seq = tc->t_last_sent_nxt +
144 sizeof(struct rds_header) +
145 be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
146 smp_mb__before_clear_bit();
147 set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
148 tc->t_last_expected_una = rm->m_ack_seq + 1;
149
150 rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
151 rm, rds_tcp_snd_nxt(tc),
152 (unsigned long long)rm->m_ack_seq);
153 }
154
155 if (hdr_off < sizeof(struct rds_header)) {
156 /* see rds_tcp_write_space() */
157 set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
158
159 ret = rds_tcp_sendmsg(tc->t_sock,
160 (void *)&rm->m_inc.i_hdr + hdr_off,
161 sizeof(rm->m_inc.i_hdr) - hdr_off);
162 if (ret < 0)
163 goto out;
164 done += ret;
165 if (hdr_off + done != sizeof(struct rds_header))
166 goto out;
167 }
168
169 while (sg < rm->m_nents) {
170 ret = tc->t_sock->ops->sendpage(tc->t_sock,
171 sg_page(&rm->m_sg[sg]),
172 rm->m_sg[sg].offset + off,
173 rm->m_sg[sg].length - off,
174 MSG_DONTWAIT|MSG_NOSIGNAL);
175 rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
176 rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
177 ret);
178 if (ret <= 0)
179 break;
180
181 off += ret;
182 done += ret;
183 if (off == rm->m_sg[sg].length) {
184 off = 0;
185 sg++;
186 }
187 }
188
189out:
190 if (ret <= 0) {
191 /* write_space will hit after EAGAIN, all else fatal */
192 if (ret == -EAGAIN) {
193 rds_tcp_stats_inc(s_tcp_sndbuf_full);
194 ret = 0;
195 } else {
196 printk(KERN_WARNING "RDS/tcp: send to %u.%u.%u.%u "
197 "returned %d, disconnecting and reconnecting\n",
198 NIPQUAD(conn->c_faddr), ret);
199 rds_conn_drop(conn);
200 }
201 }
202 if (done == 0)
203 done = ret;
204 return done;
205}
206
207/*
208 * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
209 * last byte of the message, including the header. This means that the
210 * entire message has been received if rm->m_ack_seq is "before" the next
211 * unacked byte of the TCP sequence space. We have to do very careful
212 * wrapping 32bit comparisons here.
213 */
214static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
215{
216 if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
217 return 0;
218 return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
219}
220
221void rds_tcp_write_space(struct sock *sk)
222{
223 void (*write_space)(struct sock *sk);
224 struct rds_connection *conn;
225 struct rds_tcp_connection *tc;
226
227 read_lock(&sk->sk_callback_lock);
228 conn = sk->sk_user_data;
229 if (conn == NULL) {
230 write_space = sk->sk_write_space;
231 goto out;
232 }
233
234 tc = conn->c_transport_data;
235 rdsdebug("write_space for tc %p\n", tc);
236 write_space = tc->t_orig_write_space;
237 rds_tcp_stats_inc(s_tcp_write_space_calls);
238
239 rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
240 tc->t_last_seen_una = rds_tcp_snd_una(tc);
241 rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
242
243 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
244out:
245 read_unlock(&sk->sk_callback_lock);
246
247 /*
248 * write_space is only called when data leaves tcp's send queue if
249 * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put
250 * data in tcp's send queue because we use write_space to parse the
251 * sequence numbers and notice that rds messages have been fully
252 * received.
253 *
254 * tcp's write_space clears SOCK_NOSPACE if the send queue has more
255 * than a certain amount of space. So we need to set it again *after*
256 * we call tcp's write_space or else we might only get called on the
257 * first of a series of incoming tcp acks.
258 */
259 write_space(sk);
260
261 if (sk->sk_socket)
262 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
263}
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
new file mode 100644
index 000000000000..d5898d03cd68
--- /dev/null
+++ b/net/rds/tcp_stats.c
@@ -0,0 +1,74 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "tcp.h"
39
40DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
41 ____cacheline_aligned;
42
43static const char const *rds_tcp_stat_names[] = {
44 "tcp_data_ready_calls",
45 "tcp_write_space_calls",
46 "tcp_sndbuf_full",
47 "tcp_connect_raced",
48 "tcp_listen_closed_stale",
49};
50
51unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
52 unsigned int avail)
53{
54 struct rds_tcp_statistics stats = {0, };
55 uint64_t *src;
56 uint64_t *sum;
57 size_t i;
58 int cpu;
59
60 if (avail < ARRAY_SIZE(rds_tcp_stat_names))
61 goto out;
62
63 for_each_online_cpu(cpu) {
64 src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
65 sum = (uint64_t *)&stats;
66 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
67 *(sum++) += *(src++);
68 }
69
70 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
71 ARRAY_SIZE(rds_tcp_stat_names));
72out:
73 return ARRAY_SIZE(rds_tcp_stat_names);
74}