diff options
| author | Andy Grover <andy.grover@oracle.com> | 2009-08-21 08:28:31 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2009-08-23 22:13:02 -0400 |
| commit | 70041088e3b976627ba9a183b812f39ef8a9ba0e (patch) | |
| tree | bad7b11763d7b02b185bd705fe5ed292397cbc7a /net/rds | |
| parent | 7d6fd5e7e97a2188d56441e4e96494c21c5994a7 (diff) | |
RDS: Add TCP transport to RDS
This code allows RDS to be tunneled over a TCP connection.
RDMA operations are disabled when using TCP transport,
but this frees RDS from the IB/RDMA stack dependency, and allows
it to be used with standard Ethernet adapters, or in a VM.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds')
| -rw-r--r-- | net/rds/tcp.c | 319 | ||||
| -rw-r--r-- | net/rds/tcp.h | 93 | ||||
| -rw-r--r-- | net/rds/tcp_connect.c | 153 | ||||
| -rw-r--r-- | net/rds/tcp_listen.c | 199 | ||||
| -rw-r--r-- | net/rds/tcp_recv.c | 356 | ||||
| -rw-r--r-- | net/rds/tcp_send.c | 263 | ||||
| -rw-r--r-- | net/rds/tcp_stats.c | 74 |
7 files changed, 1457 insertions, 0 deletions
diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 000000000000..e0ac9009db1a --- /dev/null +++ b/net/rds/tcp.c | |||
| @@ -0,0 +1,319 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <linux/in.h> | ||
| 35 | #include <net/tcp.h> | ||
| 36 | |||
| 37 | #include "rds.h" | ||
| 38 | #include "tcp.h" | ||
| 39 | |||
| 40 | /* only for info exporting */ | ||
| 41 | static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); | ||
| 42 | static LIST_HEAD(rds_tcp_tc_list); | ||
| 43 | unsigned int rds_tcp_tc_count; | ||
| 44 | |||
| 45 | /* Track rds_tcp_connection structs so they can be cleaned up */ | ||
| 46 | static DEFINE_SPINLOCK(rds_tcp_conn_lock); | ||
| 47 | static LIST_HEAD(rds_tcp_conn_list); | ||
| 48 | |||
| 49 | static struct kmem_cache *rds_tcp_conn_slab; | ||
| 50 | |||
| 51 | #define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) | ||
| 52 | |||
| 53 | /* doing it this way avoids calling tcp_sk() */ | ||
| 54 | void rds_tcp_nonagle(struct socket *sock) | ||
| 55 | { | ||
| 56 | mm_segment_t oldfs = get_fs(); | ||
| 57 | int val = 1; | ||
| 58 | |||
| 59 | set_fs(KERNEL_DS); | ||
| 60 | sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, | ||
| 61 | sizeof(val)); | ||
| 62 | set_fs(oldfs); | ||
| 63 | } | ||
| 64 | |||
| 65 | void rds_tcp_tune(struct socket *sock) | ||
| 66 | { | ||
| 67 | struct sock *sk = sock->sk; | ||
| 68 | |||
| 69 | rds_tcp_nonagle(sock); | ||
| 70 | |||
| 71 | /* | ||
| 72 | * We're trying to saturate gigabit with the default, | ||
| 73 | * see svc_sock_setbufsize(). | ||
| 74 | */ | ||
| 75 | lock_sock(sk); | ||
| 76 | sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; | ||
| 77 | sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; | ||
| 78 | sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; | ||
| 79 | release_sock(sk); | ||
| 80 | } | ||
| 81 | |||
| 82 | u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) | ||
| 83 | { | ||
| 84 | return tcp_sk(tc->t_sock->sk)->snd_nxt; | ||
| 85 | } | ||
| 86 | |||
| 87 | u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) | ||
| 88 | { | ||
| 89 | return tcp_sk(tc->t_sock->sk)->snd_una; | ||
| 90 | } | ||
| 91 | |||
| 92 | void rds_tcp_restore_callbacks(struct socket *sock, | ||
| 93 | struct rds_tcp_connection *tc) | ||
| 94 | { | ||
| 95 | rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); | ||
| 96 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
| 97 | |||
| 98 | /* done under the callback_lock to serialize with write_space */ | ||
| 99 | spin_lock(&rds_tcp_tc_list_lock); | ||
| 100 | list_del_init(&tc->t_list_item); | ||
| 101 | rds_tcp_tc_count--; | ||
| 102 | spin_unlock(&rds_tcp_tc_list_lock); | ||
| 103 | |||
| 104 | tc->t_sock = NULL; | ||
| 105 | |||
| 106 | sock->sk->sk_write_space = tc->t_orig_write_space; | ||
| 107 | sock->sk->sk_data_ready = tc->t_orig_data_ready; | ||
| 108 | sock->sk->sk_state_change = tc->t_orig_state_change; | ||
| 109 | sock->sk->sk_user_data = NULL; | ||
| 110 | |||
| 111 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
| 112 | } | ||
| 113 | |||
| 114 | /* | ||
| 115 | * This is the only path that sets tc->t_sock. Send and receive trust that | ||
| 116 | * it is set. The RDS_CONN_CONNECTED bit protects those paths from being | ||
| 117 | * called while it isn't set. | ||
| 118 | */ | ||
| 119 | void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) | ||
| 120 | { | ||
| 121 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 122 | |||
| 123 | rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); | ||
| 124 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
| 125 | |||
| 126 | /* done under the callback_lock to serialize with write_space */ | ||
| 127 | spin_lock(&rds_tcp_tc_list_lock); | ||
| 128 | list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); | ||
| 129 | rds_tcp_tc_count++; | ||
| 130 | spin_unlock(&rds_tcp_tc_list_lock); | ||
| 131 | |||
| 132 | /* accepted sockets need our listen data ready undone */ | ||
| 133 | if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) | ||
| 134 | sock->sk->sk_data_ready = sock->sk->sk_user_data; | ||
| 135 | |||
| 136 | tc->t_sock = sock; | ||
| 137 | tc->conn = conn; | ||
| 138 | tc->t_orig_data_ready = sock->sk->sk_data_ready; | ||
| 139 | tc->t_orig_write_space = sock->sk->sk_write_space; | ||
| 140 | tc->t_orig_state_change = sock->sk->sk_state_change; | ||
| 141 | |||
| 142 | sock->sk->sk_user_data = conn; | ||
| 143 | sock->sk->sk_data_ready = rds_tcp_data_ready; | ||
| 144 | sock->sk->sk_write_space = rds_tcp_write_space; | ||
| 145 | sock->sk->sk_state_change = rds_tcp_state_change; | ||
| 146 | |||
| 147 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
| 148 | } | ||
| 149 | |||
| 150 | static void rds_tcp_tc_info(struct socket *sock, unsigned int len, | ||
| 151 | struct rds_info_iterator *iter, | ||
| 152 | struct rds_info_lengths *lens) | ||
| 153 | { | ||
| 154 | struct rds_info_tcp_socket tsinfo; | ||
| 155 | struct rds_tcp_connection *tc; | ||
| 156 | unsigned long flags; | ||
| 157 | struct sockaddr_in sin; | ||
| 158 | int sinlen; | ||
| 159 | |||
| 160 | spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); | ||
| 161 | |||
| 162 | if (len / sizeof(tsinfo) < rds_tcp_tc_count) | ||
| 163 | goto out; | ||
| 164 | |||
| 165 | list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { | ||
| 166 | |||
| 167 | sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); | ||
| 168 | tsinfo.local_addr = sin.sin_addr.s_addr; | ||
| 169 | tsinfo.local_port = sin.sin_port; | ||
| 170 | sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); | ||
| 171 | tsinfo.peer_addr = sin.sin_addr.s_addr; | ||
| 172 | tsinfo.peer_port = sin.sin_port; | ||
| 173 | |||
| 174 | tsinfo.hdr_rem = tc->t_tinc_hdr_rem; | ||
| 175 | tsinfo.data_rem = tc->t_tinc_data_rem; | ||
| 176 | tsinfo.last_sent_nxt = tc->t_last_sent_nxt; | ||
| 177 | tsinfo.last_expected_una = tc->t_last_expected_una; | ||
| 178 | tsinfo.last_seen_una = tc->t_last_seen_una; | ||
| 179 | |||
| 180 | rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); | ||
| 181 | } | ||
| 182 | |||
| 183 | out: | ||
| 184 | lens->nr = rds_tcp_tc_count; | ||
| 185 | lens->each = sizeof(tsinfo); | ||
| 186 | |||
| 187 | spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); | ||
| 188 | } | ||
| 189 | |||
| 190 | static int rds_tcp_laddr_check(__be32 addr) | ||
| 191 | { | ||
| 192 | if (inet_addr_type(&init_net, addr) == RTN_LOCAL) | ||
| 193 | return 0; | ||
| 194 | return -EADDRNOTAVAIL; | ||
| 195 | } | ||
| 196 | |||
| 197 | static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) | ||
| 198 | { | ||
| 199 | struct rds_tcp_connection *tc; | ||
| 200 | |||
| 201 | tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); | ||
| 202 | if (tc == NULL) | ||
| 203 | return -ENOMEM; | ||
| 204 | |||
| 205 | tc->t_sock = NULL; | ||
| 206 | tc->t_tinc = NULL; | ||
| 207 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); | ||
| 208 | tc->t_tinc_data_rem = 0; | ||
| 209 | |||
| 210 | conn->c_transport_data = tc; | ||
| 211 | |||
| 212 | spin_lock_irq(&rds_tcp_conn_lock); | ||
| 213 | list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); | ||
| 214 | spin_unlock_irq(&rds_tcp_conn_lock); | ||
| 215 | |||
| 216 | rdsdebug("alloced tc %p\n", conn->c_transport_data); | ||
| 217 | return 0; | ||
| 218 | } | ||
| 219 | |||
| 220 | static void rds_tcp_conn_free(void *arg) | ||
| 221 | { | ||
| 222 | struct rds_tcp_connection *tc = arg; | ||
| 223 | rdsdebug("freeing tc %p\n", tc); | ||
| 224 | kmem_cache_free(rds_tcp_conn_slab, tc); | ||
| 225 | } | ||
| 226 | |||
| 227 | static void rds_tcp_destroy_conns(void) | ||
| 228 | { | ||
| 229 | struct rds_tcp_connection *tc, *_tc; | ||
| 230 | LIST_HEAD(tmp_list); | ||
| 231 | |||
| 232 | /* avoid calling conn_destroy with irqs off */ | ||
| 233 | spin_lock_irq(&rds_tcp_conn_lock); | ||
| 234 | list_splice(&rds_tcp_conn_list, &tmp_list); | ||
| 235 | INIT_LIST_HEAD(&rds_tcp_conn_list); | ||
| 236 | spin_unlock_irq(&rds_tcp_conn_lock); | ||
| 237 | |||
| 238 | list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { | ||
| 239 | if (tc->conn->c_passive) | ||
| 240 | rds_conn_destroy(tc->conn->c_passive); | ||
| 241 | rds_conn_destroy(tc->conn); | ||
| 242 | } | ||
| 243 | } | ||
| 244 | |||
| 245 | void rds_tcp_exit(void) | ||
| 246 | { | ||
| 247 | rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); | ||
| 248 | rds_tcp_listen_stop(); | ||
| 249 | rds_tcp_destroy_conns(); | ||
| 250 | rds_trans_unregister(&rds_tcp_transport); | ||
| 251 | rds_tcp_recv_exit(); | ||
| 252 | kmem_cache_destroy(rds_tcp_conn_slab); | ||
| 253 | } | ||
| 254 | module_exit(rds_tcp_exit); | ||
| 255 | |||
| 256 | struct rds_transport rds_tcp_transport = { | ||
| 257 | .laddr_check = rds_tcp_laddr_check, | ||
| 258 | .xmit_prepare = rds_tcp_xmit_prepare, | ||
| 259 | .xmit_complete = rds_tcp_xmit_complete, | ||
| 260 | .xmit_cong_map = rds_tcp_xmit_cong_map, | ||
| 261 | .xmit = rds_tcp_xmit, | ||
| 262 | .recv = rds_tcp_recv, | ||
| 263 | .conn_alloc = rds_tcp_conn_alloc, | ||
| 264 | .conn_free = rds_tcp_conn_free, | ||
| 265 | .conn_connect = rds_tcp_conn_connect, | ||
| 266 | .conn_shutdown = rds_tcp_conn_shutdown, | ||
| 267 | .inc_copy_to_user = rds_tcp_inc_copy_to_user, | ||
| 268 | .inc_purge = rds_tcp_inc_purge, | ||
| 269 | .inc_free = rds_tcp_inc_free, | ||
| 270 | .stats_info_copy = rds_tcp_stats_info_copy, | ||
| 271 | .exit = rds_tcp_exit, | ||
| 272 | .t_owner = THIS_MODULE, | ||
| 273 | .t_name = "tcp", | ||
| 274 | .t_prefer_loopback = 1, | ||
| 275 | }; | ||
| 276 | |||
| 277 | int __init rds_tcp_init(void) | ||
| 278 | { | ||
| 279 | int ret; | ||
| 280 | |||
| 281 | rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", | ||
| 282 | sizeof(struct rds_tcp_connection), | ||
| 283 | 0, 0, NULL); | ||
| 284 | if (rds_tcp_conn_slab == NULL) { | ||
| 285 | ret = -ENOMEM; | ||
| 286 | goto out; | ||
| 287 | } | ||
| 288 | |||
| 289 | ret = rds_tcp_recv_init(); | ||
| 290 | if (ret) | ||
| 291 | goto out_slab; | ||
| 292 | |||
| 293 | ret = rds_trans_register(&rds_tcp_transport); | ||
| 294 | if (ret) | ||
| 295 | goto out_recv; | ||
| 296 | |||
| 297 | ret = rds_tcp_listen_init(); | ||
| 298 | if (ret) | ||
| 299 | goto out_register; | ||
| 300 | |||
| 301 | rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); | ||
| 302 | |||
| 303 | goto out; | ||
| 304 | |||
| 305 | out_register: | ||
| 306 | rds_trans_unregister(&rds_tcp_transport); | ||
| 307 | out_recv: | ||
| 308 | rds_tcp_recv_exit(); | ||
| 309 | out_slab: | ||
| 310 | kmem_cache_destroy(rds_tcp_conn_slab); | ||
| 311 | out: | ||
| 312 | return ret; | ||
| 313 | } | ||
| 314 | module_init(rds_tcp_init); | ||
| 315 | |||
| 316 | MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); | ||
| 317 | MODULE_DESCRIPTION("RDS: TCP transport"); | ||
| 318 | MODULE_LICENSE("Dual BSD/GPL"); | ||
| 319 | |||
diff --git a/net/rds/tcp.h b/net/rds/tcp.h new file mode 100644 index 000000000000..844fa6b9cf5a --- /dev/null +++ b/net/rds/tcp.h | |||
| @@ -0,0 +1,93 @@ | |||
| 1 | #ifndef _RDS_TCP_H | ||
| 2 | #define _RDS_TCP_H | ||
| 3 | |||
| 4 | #define RDS_TCP_PORT 16385 | ||
| 5 | |||
| 6 | struct rds_tcp_incoming { | ||
| 7 | struct rds_incoming ti_inc; | ||
| 8 | struct sk_buff_head ti_skb_list; | ||
| 9 | }; | ||
| 10 | |||
| 11 | struct rds_tcp_connection { | ||
| 12 | |||
| 13 | struct list_head t_tcp_node; | ||
| 14 | struct rds_connection *conn; | ||
| 15 | struct socket *t_sock; | ||
| 16 | void *t_orig_write_space; | ||
| 17 | void *t_orig_data_ready; | ||
| 18 | void *t_orig_state_change; | ||
| 19 | |||
| 20 | struct rds_tcp_incoming *t_tinc; | ||
| 21 | size_t t_tinc_hdr_rem; | ||
| 22 | size_t t_tinc_data_rem; | ||
| 23 | |||
| 24 | /* XXX error report? */ | ||
| 25 | struct work_struct t_conn_w; | ||
| 26 | struct work_struct t_send_w; | ||
| 27 | struct work_struct t_down_w; | ||
| 28 | struct work_struct t_recv_w; | ||
| 29 | |||
| 30 | /* for info exporting only */ | ||
| 31 | struct list_head t_list_item; | ||
| 32 | u32 t_last_sent_nxt; | ||
| 33 | u32 t_last_expected_una; | ||
| 34 | u32 t_last_seen_una; | ||
| 35 | }; | ||
| 36 | |||
| 37 | struct rds_tcp_statistics { | ||
| 38 | uint64_t s_tcp_data_ready_calls; | ||
| 39 | uint64_t s_tcp_write_space_calls; | ||
| 40 | uint64_t s_tcp_sndbuf_full; | ||
| 41 | uint64_t s_tcp_connect_raced; | ||
| 42 | uint64_t s_tcp_listen_closed_stale; | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* tcp.c */ | ||
| 46 | int __init rds_tcp_init(void); | ||
| 47 | void rds_tcp_exit(void); | ||
| 48 | void rds_tcp_tune(struct socket *sock); | ||
| 49 | void rds_tcp_nonagle(struct socket *sock); | ||
| 50 | void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); | ||
| 51 | void rds_tcp_restore_callbacks(struct socket *sock, | ||
| 52 | struct rds_tcp_connection *tc); | ||
| 53 | u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); | ||
| 54 | u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); | ||
| 55 | u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); | ||
| 56 | extern struct rds_transport rds_tcp_transport; | ||
| 57 | |||
| 58 | /* tcp_connect.c */ | ||
| 59 | int rds_tcp_conn_connect(struct rds_connection *conn); | ||
| 60 | void rds_tcp_conn_shutdown(struct rds_connection *conn); | ||
| 61 | void rds_tcp_state_change(struct sock *sk); | ||
| 62 | |||
| 63 | /* tcp_listen.c */ | ||
| 64 | int __init rds_tcp_listen_init(void); | ||
| 65 | void rds_tcp_listen_stop(void); | ||
| 66 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes); | ||
| 67 | |||
| 68 | /* tcp_recv.c */ | ||
| 69 | int __init rds_tcp_recv_init(void); | ||
| 70 | void rds_tcp_recv_exit(void); | ||
| 71 | void rds_tcp_data_ready(struct sock *sk, int bytes); | ||
| 72 | int rds_tcp_recv(struct rds_connection *conn); | ||
| 73 | void rds_tcp_inc_purge(struct rds_incoming *inc); | ||
| 74 | void rds_tcp_inc_free(struct rds_incoming *inc); | ||
| 75 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | ||
| 76 | size_t size); | ||
| 77 | |||
| 78 | /* tcp_send.c */ | ||
| 79 | void rds_tcp_xmit_prepare(struct rds_connection *conn); | ||
| 80 | void rds_tcp_xmit_complete(struct rds_connection *conn); | ||
| 81 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
| 82 | unsigned int hdr_off, unsigned int sg, unsigned int off); | ||
| 83 | void rds_tcp_write_space(struct sock *sk); | ||
| 84 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
| 85 | struct rds_cong_map *map, unsigned long offset); | ||
| 86 | |||
| 87 | /* tcp_stats.c */ | ||
| 88 | DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); | ||
| 89 | #define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member) | ||
| 90 | unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, | ||
| 91 | unsigned int avail); | ||
| 92 | |||
| 93 | #endif | ||
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c new file mode 100644 index 000000000000..211522f9a9a2 --- /dev/null +++ b/net/rds/tcp_connect.c | |||
| @@ -0,0 +1,153 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <linux/in.h> | ||
| 35 | #include <net/tcp.h> | ||
| 36 | |||
| 37 | #include "rds.h" | ||
| 38 | #include "tcp.h" | ||
| 39 | |||
| 40 | void rds_tcp_state_change(struct sock *sk) | ||
| 41 | { | ||
| 42 | void (*state_change)(struct sock *sk); | ||
| 43 | struct rds_connection *conn; | ||
| 44 | struct rds_tcp_connection *tc; | ||
| 45 | |||
| 46 | read_lock(&sk->sk_callback_lock); | ||
| 47 | conn = sk->sk_user_data; | ||
| 48 | if (conn == NULL) { | ||
| 49 | state_change = sk->sk_state_change; | ||
| 50 | goto out; | ||
| 51 | } | ||
| 52 | tc = conn->c_transport_data; | ||
| 53 | state_change = tc->t_orig_state_change; | ||
| 54 | |||
| 55 | rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); | ||
| 56 | |||
| 57 | switch(sk->sk_state) { | ||
| 58 | /* ignore connecting sockets as they make progress */ | ||
| 59 | case TCP_SYN_SENT: | ||
| 60 | case TCP_SYN_RECV: | ||
| 61 | break; | ||
| 62 | case TCP_ESTABLISHED: | ||
| 63 | rds_connect_complete(conn); | ||
| 64 | break; | ||
| 65 | case TCP_CLOSE: | ||
| 66 | rds_conn_drop(conn); | ||
| 67 | default: | ||
| 68 | break; | ||
| 69 | } | ||
| 70 | out: | ||
| 71 | read_unlock(&sk->sk_callback_lock); | ||
| 72 | state_change(sk); | ||
| 73 | } | ||
| 74 | |||
| 75 | int rds_tcp_conn_connect(struct rds_connection *conn) | ||
| 76 | { | ||
| 77 | struct socket *sock = NULL; | ||
| 78 | struct sockaddr_in src, dest; | ||
| 79 | int ret; | ||
| 80 | |||
| 81 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | ||
| 82 | if (ret < 0) | ||
| 83 | goto out; | ||
| 84 | |||
| 85 | rds_tcp_tune(sock); | ||
| 86 | |||
| 87 | src.sin_family = AF_INET; | ||
| 88 | src.sin_addr.s_addr = (__force u32)conn->c_laddr; | ||
| 89 | src.sin_port = (__force u16)htons(0); | ||
| 90 | |||
| 91 | ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); | ||
| 92 | if (ret) { | ||
| 93 | rdsdebug("bind failed with %d at address %u.%u.%u.%u\n", | ||
| 94 | ret, NIPQUAD(conn->c_laddr)); | ||
| 95 | goto out; | ||
| 96 | } | ||
| 97 | |||
| 98 | dest.sin_family = AF_INET; | ||
| 99 | dest.sin_addr.s_addr = (__force u32)conn->c_faddr; | ||
| 100 | dest.sin_port = (__force u16)htons(RDS_TCP_PORT); | ||
| 101 | |||
| 102 | /* | ||
| 103 | * once we call connect() we can start getting callbacks and they | ||
| 104 | * own the socket | ||
| 105 | */ | ||
| 106 | rds_tcp_set_callbacks(sock, conn); | ||
| 107 | ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), | ||
| 108 | O_NONBLOCK); | ||
| 109 | sock = NULL; | ||
| 110 | |||
| 111 | rdsdebug("connect to address %u.%u.%u.%u returned %d\n", | ||
| 112 | NIPQUAD(conn->c_faddr), ret); | ||
| 113 | if (ret == -EINPROGRESS) | ||
| 114 | ret = 0; | ||
| 115 | |||
| 116 | out: | ||
| 117 | if (sock) | ||
| 118 | sock_release(sock); | ||
| 119 | return ret; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Before killing the tcp socket this needs to serialize with callbacks. The | ||
| 124 | * caller has already grabbed the sending sem so we're serialized with other | ||
| 125 | * senders. | ||
| 126 | * | ||
| 127 | * TCP calls the callbacks with the sock lock so we hold it while we reset the | ||
| 128 | * callbacks to those set by TCP. Our callbacks won't execute again once we | ||
| 129 | * hold the sock lock. | ||
| 130 | */ | ||
| 131 | void rds_tcp_conn_shutdown(struct rds_connection *conn) | ||
| 132 | { | ||
| 133 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 134 | struct socket *sock = tc->t_sock; | ||
| 135 | |||
| 136 | rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); | ||
| 137 | |||
| 138 | if (sock) { | ||
| 139 | sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); | ||
| 140 | lock_sock(sock->sk); | ||
| 141 | rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ | ||
| 142 | |||
| 143 | release_sock(sock->sk); | ||
| 144 | sock_release(sock); | ||
| 145 | }; | ||
| 146 | |||
| 147 | if (tc->t_tinc) { | ||
| 148 | rds_inc_put(&tc->t_tinc->ti_inc); | ||
| 149 | tc->t_tinc = NULL; | ||
| 150 | } | ||
| 151 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); | ||
| 152 | tc->t_tinc_data_rem = 0; | ||
| 153 | } | ||
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c new file mode 100644 index 000000000000..24b743eb0b1b --- /dev/null +++ b/net/rds/tcp_listen.c | |||
| @@ -0,0 +1,199 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <linux/in.h> | ||
| 35 | #include <net/tcp.h> | ||
| 36 | |||
| 37 | #include "rds.h" | ||
| 38 | #include "tcp.h" | ||
| 39 | |||
| 40 | /* | ||
| 41 | * cheesy, but simple.. | ||
| 42 | */ | ||
| 43 | static void rds_tcp_accept_worker(struct work_struct *work); | ||
| 44 | static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); | ||
| 45 | static struct socket *rds_tcp_listen_sock; | ||
| 46 | |||
| 47 | static int rds_tcp_accept_one(struct socket *sock) | ||
| 48 | { | ||
| 49 | struct socket *new_sock = NULL; | ||
| 50 | struct rds_connection *conn; | ||
| 51 | int ret; | ||
| 52 | struct inet_sock *inet; | ||
| 53 | |||
| 54 | ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, | ||
| 55 | sock->sk->sk_protocol, &new_sock); | ||
| 56 | if (ret) | ||
| 57 | goto out; | ||
| 58 | |||
| 59 | new_sock->type = sock->type; | ||
| 60 | new_sock->ops = sock->ops; | ||
| 61 | ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); | ||
| 62 | if (ret < 0) | ||
| 63 | goto out; | ||
| 64 | |||
| 65 | rds_tcp_tune(new_sock); | ||
| 66 | |||
| 67 | inet = inet_sk(new_sock->sk); | ||
| 68 | |||
| 69 | rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", | ||
| 70 | NIPQUAD(inet->saddr), ntohs(inet->sport), | ||
| 71 | NIPQUAD(inet->daddr), ntohs(inet->dport)); | ||
| 72 | |||
| 73 | conn = rds_conn_create(inet->saddr, inet->daddr, &rds_tcp_transport, | ||
| 74 | GFP_KERNEL); | ||
| 75 | if (IS_ERR(conn)) { | ||
| 76 | ret = PTR_ERR(conn); | ||
| 77 | goto out; | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | ||
| 81 | * see the comment above rds_queue_delayed_reconnect() | ||
| 82 | */ | ||
| 83 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { | ||
| 84 | if (rds_conn_state(conn) == RDS_CONN_UP) | ||
| 85 | rds_tcp_stats_inc(s_tcp_listen_closed_stale); | ||
| 86 | else | ||
| 87 | rds_tcp_stats_inc(s_tcp_connect_raced); | ||
| 88 | rds_conn_drop(conn); | ||
| 89 | ret = 0; | ||
| 90 | goto out; | ||
| 91 | } | ||
| 92 | |||
| 93 | rds_tcp_set_callbacks(new_sock, conn); | ||
| 94 | rds_connect_complete(conn); | ||
| 95 | new_sock = NULL; | ||
| 96 | ret = 0; | ||
| 97 | |||
| 98 | out: | ||
| 99 | if (new_sock) | ||
| 100 | sock_release(new_sock); | ||
| 101 | return ret; | ||
| 102 | } | ||
| 103 | |||
| 104 | static void rds_tcp_accept_worker(struct work_struct *work) | ||
| 105 | { | ||
| 106 | while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) | ||
| 107 | cond_resched(); | ||
| 108 | } | ||
| 109 | |||
| 110 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes) | ||
| 111 | { | ||
| 112 | void (*ready)(struct sock *sk, int bytes); | ||
| 113 | |||
| 114 | rdsdebug("listen data ready sk %p\n", sk); | ||
| 115 | |||
| 116 | read_lock(&sk->sk_callback_lock); | ||
| 117 | ready = sk->sk_user_data; | ||
| 118 | if (ready == NULL) { /* check for teardown race */ | ||
| 119 | ready = sk->sk_data_ready; | ||
| 120 | goto out; | ||
| 121 | } | ||
| 122 | |||
| 123 | /* | ||
| 124 | * ->sk_data_ready is also called for a newly established child socket | ||
| 125 | * before it has been accepted and the accepter has set up their | ||
| 126 | * data_ready.. we only want to queue listen work for our listening | ||
| 127 | * socket | ||
| 128 | */ | ||
| 129 | if (sk->sk_state == TCP_LISTEN) | ||
| 130 | queue_work(rds_wq, &rds_tcp_listen_work); | ||
| 131 | |||
| 132 | out: | ||
| 133 | read_unlock(&sk->sk_callback_lock); | ||
| 134 | ready(sk, bytes); | ||
| 135 | } | ||
| 136 | |||
| 137 | int __init rds_tcp_listen_init(void) | ||
| 138 | { | ||
| 139 | struct sockaddr_in sin; | ||
| 140 | struct socket *sock = NULL; | ||
| 141 | int ret; | ||
| 142 | |||
| 143 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | ||
| 144 | if (ret < 0) | ||
| 145 | goto out; | ||
| 146 | |||
| 147 | sock->sk->sk_reuse = 1; | ||
| 148 | rds_tcp_nonagle(sock); | ||
| 149 | |||
| 150 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
| 151 | sock->sk->sk_user_data = sock->sk->sk_data_ready; | ||
| 152 | sock->sk->sk_data_ready = rds_tcp_listen_data_ready; | ||
| 153 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
| 154 | |||
| 155 | sin.sin_family = PF_INET, | ||
| 156 | sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); | ||
| 157 | sin.sin_port = (__force u16)htons(RDS_TCP_PORT); | ||
| 158 | |||
| 159 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); | ||
| 160 | if (ret < 0) | ||
| 161 | goto out; | ||
| 162 | |||
| 163 | ret = sock->ops->listen(sock, 64); | ||
| 164 | if (ret < 0) | ||
| 165 | goto out; | ||
| 166 | |||
| 167 | rds_tcp_listen_sock = sock; | ||
| 168 | sock = NULL; | ||
| 169 | out: | ||
| 170 | if (sock) | ||
| 171 | sock_release(sock); | ||
| 172 | return ret; | ||
| 173 | } | ||
| 174 | |||
| 175 | void rds_tcp_listen_stop(void) | ||
| 176 | { | ||
| 177 | struct socket *sock = rds_tcp_listen_sock; | ||
| 178 | struct sock *sk; | ||
| 179 | |||
| 180 | if (sock == NULL) | ||
| 181 | return; | ||
| 182 | |||
| 183 | sk = sock->sk; | ||
| 184 | |||
| 185 | /* serialize with and prevent further callbacks */ | ||
| 186 | lock_sock(sk); | ||
| 187 | write_lock_bh(&sk->sk_callback_lock); | ||
| 188 | if (sk->sk_user_data) { | ||
| 189 | sk->sk_data_ready = sk->sk_user_data; | ||
| 190 | sk->sk_user_data = NULL; | ||
| 191 | } | ||
| 192 | write_unlock_bh(&sk->sk_callback_lock); | ||
| 193 | release_sock(sk); | ||
| 194 | |||
| 195 | /* wait for accepts to stop and close the socket */ | ||
| 196 | flush_workqueue(rds_wq); | ||
| 197 | sock_release(sock); | ||
| 198 | rds_tcp_listen_sock = NULL; | ||
| 199 | } | ||
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c new file mode 100644 index 000000000000..c00dafffbb5a --- /dev/null +++ b/net/rds/tcp_recv.c | |||
| @@ -0,0 +1,356 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <net/tcp.h> | ||
| 35 | |||
| 36 | #include "rds.h" | ||
| 37 | #include "tcp.h" | ||
| 38 | |||
| 39 | static struct kmem_cache *rds_tcp_incoming_slab; | ||
| 40 | |||
| 41 | void rds_tcp_inc_purge(struct rds_incoming *inc) | ||
| 42 | { | ||
| 43 | struct rds_tcp_incoming *tinc; | ||
| 44 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | ||
| 45 | rdsdebug("purging tinc %p inc %p\n", tinc, inc); | ||
| 46 | skb_queue_purge(&tinc->ti_skb_list); | ||
| 47 | } | ||
| 48 | |||
| 49 | void rds_tcp_inc_free(struct rds_incoming *inc) | ||
| 50 | { | ||
| 51 | struct rds_tcp_incoming *tinc; | ||
| 52 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | ||
| 53 | rds_tcp_inc_purge(inc); | ||
| 54 | rdsdebug("freeing tinc %p inc %p\n", tinc, inc); | ||
| 55 | kmem_cache_free(rds_tcp_incoming_slab, tinc); | ||
| 56 | } | ||
| 57 | |||
| 58 | /* | ||
| 59 | * this is pretty lame, but, whatever. | ||
| 60 | */ | ||
| 61 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | ||
| 62 | size_t size) | ||
| 63 | { | ||
| 64 | struct rds_tcp_incoming *tinc; | ||
| 65 | struct iovec *iov, tmp; | ||
| 66 | struct sk_buff *skb; | ||
| 67 | unsigned long to_copy, skb_off; | ||
| 68 | int ret = 0; | ||
| 69 | |||
| 70 | if (size == 0) | ||
| 71 | goto out; | ||
| 72 | |||
| 73 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | ||
| 74 | iov = first_iov; | ||
| 75 | tmp = *iov; | ||
| 76 | |||
| 77 | skb_queue_walk(&tinc->ti_skb_list, skb) { | ||
| 78 | skb_off = 0; | ||
| 79 | while (skb_off < skb->len) { | ||
| 80 | while (tmp.iov_len == 0) { | ||
| 81 | iov++; | ||
| 82 | tmp = *iov; | ||
| 83 | } | ||
| 84 | |||
| 85 | to_copy = min(tmp.iov_len, size); | ||
| 86 | to_copy = min(to_copy, skb->len - skb_off); | ||
| 87 | |||
| 88 | rdsdebug("ret %d size %zu skb %p skb_off %lu " | ||
| 89 | "skblen %d iov_base %p iov_len %zu cpy %lu\n", | ||
| 90 | ret, size, skb, skb_off, skb->len, | ||
| 91 | tmp.iov_base, tmp.iov_len, to_copy); | ||
| 92 | |||
| 93 | /* modifies tmp as it copies */ | ||
| 94 | if (skb_copy_datagram_iovec(skb, skb_off, &tmp, | ||
| 95 | to_copy)) { | ||
| 96 | ret = -EFAULT; | ||
| 97 | goto out; | ||
| 98 | } | ||
| 99 | |||
| 100 | size -= to_copy; | ||
| 101 | ret += to_copy; | ||
| 102 | skb_off += to_copy; | ||
| 103 | if (size == 0) | ||
| 104 | goto out; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | out: | ||
| 108 | return ret; | ||
| 109 | } | ||
| 110 | |||
| 111 | /* | ||
| 112 | * We have a series of skbs that have fragmented pieces of the congestion | ||
| 113 | * bitmap. They must add up to the exact size of the congestion bitmap. We | ||
| 114 | * use the skb helpers to copy those into the pages that make up the in-memory | ||
| 115 | * congestion bitmap for the remote address of this connection. We then tell | ||
| 116 | * the congestion core that the bitmap has been changed so that it can wake up | ||
| 117 | * sleepers. | ||
| 118 | * | ||
| 119 | * This is racing with sending paths which are using test_bit to see if the | ||
| 120 | * bitmap indicates that their recipient is congested. | ||
| 121 | */ | ||
| 122 | |||
| 123 | static void rds_tcp_cong_recv(struct rds_connection *conn, | ||
| 124 | struct rds_tcp_incoming *tinc) | ||
| 125 | { | ||
| 126 | struct sk_buff *skb; | ||
| 127 | unsigned int to_copy, skb_off; | ||
| 128 | unsigned int map_off; | ||
| 129 | unsigned int map_page; | ||
| 130 | struct rds_cong_map *map; | ||
| 131 | int ret; | ||
| 132 | |||
| 133 | /* catch completely corrupt packets */ | ||
| 134 | if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) | ||
| 135 | return; | ||
| 136 | |||
| 137 | map_page = 0; | ||
| 138 | map_off = 0; | ||
| 139 | map = conn->c_fcong; | ||
| 140 | |||
| 141 | skb_queue_walk(&tinc->ti_skb_list, skb) { | ||
| 142 | skb_off = 0; | ||
| 143 | while (skb_off < skb->len) { | ||
| 144 | to_copy = min_t(unsigned int, PAGE_SIZE - map_off, | ||
| 145 | skb->len - skb_off); | ||
| 146 | |||
| 147 | BUG_ON(map_page >= RDS_CONG_MAP_PAGES); | ||
| 148 | |||
| 149 | /* only returns 0 or -error */ | ||
| 150 | ret = skb_copy_bits(skb, skb_off, | ||
| 151 | (void *)map->m_page_addrs[map_page] + map_off, | ||
| 152 | to_copy); | ||
| 153 | BUG_ON(ret != 0); | ||
| 154 | |||
| 155 | skb_off += to_copy; | ||
| 156 | map_off += to_copy; | ||
| 157 | if (map_off == PAGE_SIZE) { | ||
| 158 | map_off = 0; | ||
| 159 | map_page++; | ||
| 160 | } | ||
| 161 | } | ||
| 162 | } | ||
| 163 | |||
| 164 | rds_cong_map_updated(map, ~(u64) 0); | ||
| 165 | } | ||
| 166 | |||
| 167 | struct rds_tcp_desc_arg { | ||
| 168 | struct rds_connection *conn; | ||
| 169 | gfp_t gfp; | ||
| 170 | enum km_type km; | ||
| 171 | }; | ||
| 172 | |||
| 173 | static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, | ||
| 174 | unsigned int offset, size_t len) | ||
| 175 | { | ||
| 176 | struct rds_tcp_desc_arg *arg = desc->arg.data; | ||
| 177 | struct rds_connection *conn = arg->conn; | ||
| 178 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 179 | struct rds_tcp_incoming *tinc = tc->t_tinc; | ||
| 180 | struct sk_buff *clone; | ||
| 181 | size_t left = len, to_copy; | ||
| 182 | |||
| 183 | rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, | ||
| 184 | len); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * tcp_read_sock() interprets partial progress as an indication to stop | ||
| 188 | * processing. | ||
| 189 | */ | ||
| 190 | while (left) { | ||
| 191 | if (tinc == NULL) { | ||
| 192 | tinc = kmem_cache_alloc(rds_tcp_incoming_slab, | ||
| 193 | arg->gfp); | ||
| 194 | if (tinc == NULL) { | ||
| 195 | desc->error = -ENOMEM; | ||
| 196 | goto out; | ||
| 197 | } | ||
| 198 | tc->t_tinc = tinc; | ||
| 199 | rdsdebug("alloced tinc %p\n", tinc); | ||
| 200 | rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); | ||
| 201 | /* | ||
| 202 | * XXX * we might be able to use the __ variants when | ||
| 203 | * we've already serialized at a higher level. | ||
| 204 | */ | ||
| 205 | skb_queue_head_init(&tinc->ti_skb_list); | ||
| 206 | } | ||
| 207 | |||
| 208 | if (left && tc->t_tinc_hdr_rem) { | ||
| 209 | to_copy = min(tc->t_tinc_hdr_rem, left); | ||
| 210 | rdsdebug("copying %zu header from skb %p\n", to_copy, | ||
| 211 | skb); | ||
| 212 | skb_copy_bits(skb, offset, | ||
| 213 | (char *)&tinc->ti_inc.i_hdr + | ||
| 214 | sizeof(struct rds_header) - | ||
| 215 | tc->t_tinc_hdr_rem, | ||
| 216 | to_copy); | ||
| 217 | tc->t_tinc_hdr_rem -= to_copy; | ||
| 218 | left -= to_copy; | ||
| 219 | offset += to_copy; | ||
| 220 | |||
| 221 | if (tc->t_tinc_hdr_rem == 0) { | ||
| 222 | /* could be 0 for a 0 len message */ | ||
| 223 | tc->t_tinc_data_rem = | ||
| 224 | be32_to_cpu(tinc->ti_inc.i_hdr.h_len); | ||
| 225 | } | ||
| 226 | } | ||
| 227 | |||
| 228 | if (left && tc->t_tinc_data_rem) { | ||
| 229 | clone = skb_clone(skb, arg->gfp); | ||
| 230 | if (clone == NULL) { | ||
| 231 | desc->error = -ENOMEM; | ||
| 232 | goto out; | ||
| 233 | } | ||
| 234 | |||
| 235 | to_copy = min(tc->t_tinc_data_rem, left); | ||
| 236 | pskb_pull(clone, offset); | ||
| 237 | pskb_trim(clone, to_copy); | ||
| 238 | skb_queue_tail(&tinc->ti_skb_list, clone); | ||
| 239 | |||
| 240 | rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " | ||
| 241 | "clone %p data %p len %d\n", | ||
| 242 | skb, skb->data, skb->len, offset, to_copy, | ||
| 243 | clone, clone->data, clone->len); | ||
| 244 | |||
| 245 | tc->t_tinc_data_rem -= to_copy; | ||
| 246 | left -= to_copy; | ||
| 247 | offset += to_copy; | ||
| 248 | } | ||
| 249 | |||
| 250 | if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { | ||
| 251 | if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) | ||
| 252 | rds_tcp_cong_recv(conn, tinc); | ||
| 253 | else | ||
| 254 | rds_recv_incoming(conn, conn->c_faddr, | ||
| 255 | conn->c_laddr, &tinc->ti_inc, | ||
| 256 | arg->gfp, arg->km); | ||
| 257 | |||
| 258 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); | ||
| 259 | tc->t_tinc_data_rem = 0; | ||
| 260 | tc->t_tinc = NULL; | ||
| 261 | rds_inc_put(&tinc->ti_inc); | ||
| 262 | tinc = NULL; | ||
| 263 | } | ||
| 264 | } | ||
| 265 | out: | ||
| 266 | rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", | ||
| 267 | len, left, skb->len, | ||
| 268 | skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); | ||
| 269 | return len - left; | ||
| 270 | } | ||
| 271 | |||
| 272 | /* the caller has to hold the sock lock */ | ||
| 273 | int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) | ||
| 274 | { | ||
| 275 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 276 | struct socket *sock = tc->t_sock; | ||
| 277 | read_descriptor_t desc; | ||
| 278 | struct rds_tcp_desc_arg arg; | ||
| 279 | |||
| 280 | /* It's like glib in the kernel! */ | ||
| 281 | arg.conn = conn; | ||
| 282 | arg.gfp = gfp; | ||
| 283 | arg.km = km; | ||
| 284 | desc.arg.data = &arg; | ||
| 285 | desc.error = 0; | ||
| 286 | desc.count = 1; /* give more than one skb per call */ | ||
| 287 | |||
| 288 | tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); | ||
| 289 | rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, | ||
| 290 | desc.error); | ||
| 291 | |||
| 292 | return desc.error; | ||
| 293 | } | ||
| 294 | |||
| 295 | /* | ||
| 296 | * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from | ||
| 297 | * data_ready. | ||
| 298 | * | ||
| 299 | * if we fail to allocate we're in trouble.. blindly wait some time before | ||
| 300 | * trying again to see if the VM can free up something for us. | ||
| 301 | */ | ||
| 302 | int rds_tcp_recv(struct rds_connection *conn) | ||
| 303 | { | ||
| 304 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 305 | struct socket *sock = tc->t_sock; | ||
| 306 | int ret = 0; | ||
| 307 | |||
| 308 | rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); | ||
| 309 | |||
| 310 | lock_sock(sock->sk); | ||
| 311 | ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); | ||
| 312 | release_sock(sock->sk); | ||
| 313 | |||
| 314 | return ret; | ||
| 315 | } | ||
| 316 | |||
| 317 | void rds_tcp_data_ready(struct sock *sk, int bytes) | ||
| 318 | { | ||
| 319 | void (*ready)(struct sock *sk, int bytes); | ||
| 320 | struct rds_connection *conn; | ||
| 321 | struct rds_tcp_connection *tc; | ||
| 322 | |||
| 323 | rdsdebug("data ready sk %p bytes %d\n", sk, bytes); | ||
| 324 | |||
| 325 | read_lock(&sk->sk_callback_lock); | ||
| 326 | conn = sk->sk_user_data; | ||
| 327 | if (conn == NULL) { /* check for teardown race */ | ||
| 328 | ready = sk->sk_data_ready; | ||
| 329 | goto out; | ||
| 330 | } | ||
| 331 | |||
| 332 | tc = conn->c_transport_data; | ||
| 333 | ready = tc->t_orig_data_ready; | ||
| 334 | rds_tcp_stats_inc(s_tcp_data_ready_calls); | ||
| 335 | |||
| 336 | if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) | ||
| 337 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
| 338 | out: | ||
| 339 | read_unlock(&sk->sk_callback_lock); | ||
| 340 | ready(sk, bytes); | ||
| 341 | } | ||
| 342 | |||
| 343 | int __init rds_tcp_recv_init(void) | ||
| 344 | { | ||
| 345 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", | ||
| 346 | sizeof(struct rds_tcp_incoming), | ||
| 347 | 0, 0, NULL); | ||
| 348 | if (rds_tcp_incoming_slab == NULL) | ||
| 349 | return -ENOMEM; | ||
| 350 | return 0; | ||
| 351 | } | ||
| 352 | |||
| 353 | void rds_tcp_recv_exit(void) | ||
| 354 | { | ||
| 355 | kmem_cache_destroy(rds_tcp_incoming_slab); | ||
| 356 | } | ||
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c new file mode 100644 index 000000000000..ab545e0cd5d6 --- /dev/null +++ b/net/rds/tcp_send.c | |||
| @@ -0,0 +1,263 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <linux/in.h> | ||
| 35 | #include <net/tcp.h> | ||
| 36 | |||
| 37 | #include "rds.h" | ||
| 38 | #include "tcp.h" | ||
| 39 | |||
| 40 | static void rds_tcp_cork(struct socket *sock, int val) | ||
| 41 | { | ||
| 42 | mm_segment_t oldfs; | ||
| 43 | |||
| 44 | oldfs = get_fs(); | ||
| 45 | set_fs(KERNEL_DS); | ||
| 46 | sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val, | ||
| 47 | sizeof(val)); | ||
| 48 | set_fs(oldfs); | ||
| 49 | } | ||
| 50 | |||
| 51 | void rds_tcp_xmit_prepare(struct rds_connection *conn) | ||
| 52 | { | ||
| 53 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 54 | |||
| 55 | rds_tcp_cork(tc->t_sock, 1); | ||
| 56 | } | ||
| 57 | |||
| 58 | void rds_tcp_xmit_complete(struct rds_connection *conn) | ||
| 59 | { | ||
| 60 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 61 | |||
| 62 | rds_tcp_cork(tc->t_sock, 0); | ||
| 63 | } | ||
| 64 | |||
| 65 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
| 66 | int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) | ||
| 67 | { | ||
| 68 | struct kvec vec = { | ||
| 69 | .iov_base = data, | ||
| 70 | .iov_len = len, | ||
| 71 | }; | ||
| 72 | struct msghdr msg = { | ||
| 73 | .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, | ||
| 74 | }; | ||
| 75 | |||
| 76 | return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); | ||
| 77 | } | ||
| 78 | |||
| 79 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
| 80 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
| 81 | struct rds_cong_map *map, unsigned long offset) | ||
| 82 | { | ||
| 83 | static struct rds_header rds_tcp_map_header = { | ||
| 84 | .h_flags = RDS_FLAG_CONG_BITMAP, | ||
| 85 | }; | ||
| 86 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 87 | unsigned long i; | ||
| 88 | int ret; | ||
| 89 | int copied = 0; | ||
| 90 | |||
| 91 | /* Some problem claims cpu_to_be32(constant) isn't a constant. */ | ||
| 92 | rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); | ||
| 93 | |||
| 94 | if (offset < sizeof(struct rds_header)) { | ||
| 95 | ret = rds_tcp_sendmsg(tc->t_sock, | ||
| 96 | (void *)&rds_tcp_map_header + offset, | ||
| 97 | sizeof(struct rds_header) - offset); | ||
| 98 | if (ret <= 0) | ||
| 99 | return ret; | ||
| 100 | offset += ret; | ||
| 101 | copied = ret; | ||
| 102 | if (offset < sizeof(struct rds_header)) | ||
| 103 | return ret; | ||
| 104 | } | ||
| 105 | |||
| 106 | offset -= sizeof(struct rds_header); | ||
| 107 | i = offset / PAGE_SIZE; | ||
| 108 | offset = offset % PAGE_SIZE; | ||
| 109 | BUG_ON(i >= RDS_CONG_MAP_PAGES); | ||
| 110 | |||
| 111 | do { | ||
| 112 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | ||
| 113 | virt_to_page(map->m_page_addrs[i]), | ||
| 114 | offset, PAGE_SIZE - offset, | ||
| 115 | MSG_DONTWAIT); | ||
| 116 | if (ret <= 0) | ||
| 117 | break; | ||
| 118 | copied += ret; | ||
| 119 | offset += ret; | ||
| 120 | if (offset == PAGE_SIZE) { | ||
| 121 | offset = 0; | ||
| 122 | i++; | ||
| 123 | } | ||
| 124 | } while (i < RDS_CONG_MAP_PAGES); | ||
| 125 | |||
| 126 | return copied ? copied : ret; | ||
| 127 | } | ||
| 128 | |||
| 129 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
| 130 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
| 131 | unsigned int hdr_off, unsigned int sg, unsigned int off) | ||
| 132 | { | ||
| 133 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
| 134 | int done = 0; | ||
| 135 | int ret = 0; | ||
| 136 | |||
| 137 | if (hdr_off == 0) { | ||
| 138 | /* | ||
| 139 | * m_ack_seq is set to the sequence number of the last byte of | ||
| 140 | * header and data. see rds_tcp_is_acked(). | ||
| 141 | */ | ||
| 142 | tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); | ||
| 143 | rm->m_ack_seq = tc->t_last_sent_nxt + | ||
| 144 | sizeof(struct rds_header) + | ||
| 145 | be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; | ||
| 146 | smp_mb__before_clear_bit(); | ||
| 147 | set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); | ||
| 148 | tc->t_last_expected_una = rm->m_ack_seq + 1; | ||
| 149 | |||
| 150 | rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", | ||
| 151 | rm, rds_tcp_snd_nxt(tc), | ||
| 152 | (unsigned long long)rm->m_ack_seq); | ||
| 153 | } | ||
| 154 | |||
| 155 | if (hdr_off < sizeof(struct rds_header)) { | ||
| 156 | /* see rds_tcp_write_space() */ | ||
| 157 | set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); | ||
| 158 | |||
| 159 | ret = rds_tcp_sendmsg(tc->t_sock, | ||
| 160 | (void *)&rm->m_inc.i_hdr + hdr_off, | ||
| 161 | sizeof(rm->m_inc.i_hdr) - hdr_off); | ||
| 162 | if (ret < 0) | ||
| 163 | goto out; | ||
| 164 | done += ret; | ||
| 165 | if (hdr_off + done != sizeof(struct rds_header)) | ||
| 166 | goto out; | ||
| 167 | } | ||
| 168 | |||
| 169 | while (sg < rm->m_nents) { | ||
| 170 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | ||
| 171 | sg_page(&rm->m_sg[sg]), | ||
| 172 | rm->m_sg[sg].offset + off, | ||
| 173 | rm->m_sg[sg].length - off, | ||
| 174 | MSG_DONTWAIT|MSG_NOSIGNAL); | ||
| 175 | rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), | ||
| 176 | rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, | ||
| 177 | ret); | ||
| 178 | if (ret <= 0) | ||
| 179 | break; | ||
| 180 | |||
| 181 | off += ret; | ||
| 182 | done += ret; | ||
| 183 | if (off == rm->m_sg[sg].length) { | ||
| 184 | off = 0; | ||
| 185 | sg++; | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | out: | ||
| 190 | if (ret <= 0) { | ||
| 191 | /* write_space will hit after EAGAIN, all else fatal */ | ||
| 192 | if (ret == -EAGAIN) { | ||
| 193 | rds_tcp_stats_inc(s_tcp_sndbuf_full); | ||
| 194 | ret = 0; | ||
| 195 | } else { | ||
| 196 | printk(KERN_WARNING "RDS/tcp: send to %u.%u.%u.%u " | ||
| 197 | "returned %d, disconnecting and reconnecting\n", | ||
| 198 | NIPQUAD(conn->c_faddr), ret); | ||
| 199 | rds_conn_drop(conn); | ||
| 200 | } | ||
| 201 | } | ||
| 202 | if (done == 0) | ||
| 203 | done = ret; | ||
| 204 | return done; | ||
| 205 | } | ||
| 206 | |||
| 207 | /* | ||
| 208 | * rm->m_ack_seq is set to the tcp sequence number that corresponds to the | ||
| 209 | * last byte of the message, including the header. This means that the | ||
| 210 | * entire message has been received if rm->m_ack_seq is "before" the next | ||
| 211 | * unacked byte of the TCP sequence space. We have to do very careful | ||
| 212 | * wrapping 32bit comparisons here. | ||
| 213 | */ | ||
| 214 | static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) | ||
| 215 | { | ||
| 216 | if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) | ||
| 217 | return 0; | ||
| 218 | return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; | ||
| 219 | } | ||
| 220 | |||
| 221 | void rds_tcp_write_space(struct sock *sk) | ||
| 222 | { | ||
| 223 | void (*write_space)(struct sock *sk); | ||
| 224 | struct rds_connection *conn; | ||
| 225 | struct rds_tcp_connection *tc; | ||
| 226 | |||
| 227 | read_lock(&sk->sk_callback_lock); | ||
| 228 | conn = sk->sk_user_data; | ||
| 229 | if (conn == NULL) { | ||
| 230 | write_space = sk->sk_write_space; | ||
| 231 | goto out; | ||
| 232 | } | ||
| 233 | |||
| 234 | tc = conn->c_transport_data; | ||
| 235 | rdsdebug("write_space for tc %p\n", tc); | ||
| 236 | write_space = tc->t_orig_write_space; | ||
| 237 | rds_tcp_stats_inc(s_tcp_write_space_calls); | ||
| 238 | |||
| 239 | rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); | ||
| 240 | tc->t_last_seen_una = rds_tcp_snd_una(tc); | ||
| 241 | rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); | ||
| 242 | |||
| 243 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
| 244 | out: | ||
| 245 | read_unlock(&sk->sk_callback_lock); | ||
| 246 | |||
| 247 | /* | ||
| 248 | * write_space is only called when data leaves tcp's send queue if | ||
| 249 | * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put | ||
| 250 | * data in tcp's send queue because we use write_space to parse the | ||
| 251 | * sequence numbers and notice that rds messages have been fully | ||
| 252 | * received. | ||
| 253 | * | ||
| 254 | * tcp's write_space clears SOCK_NOSPACE if the send queue has more | ||
| 255 | * than a certain amount of space. So we need to set it again *after* | ||
| 256 | * we call tcp's write_space or else we might only get called on the | ||
| 257 | * first of a series of incoming tcp acks. | ||
| 258 | */ | ||
| 259 | write_space(sk); | ||
| 260 | |||
| 261 | if (sk->sk_socket) | ||
| 262 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
| 263 | } | ||
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c new file mode 100644 index 000000000000..d5898d03cd68 --- /dev/null +++ b/net/rds/tcp_stats.c | |||
| @@ -0,0 +1,74 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/percpu.h> | ||
| 34 | #include <linux/seq_file.h> | ||
| 35 | #include <linux/proc_fs.h> | ||
| 36 | |||
| 37 | #include "rds.h" | ||
| 38 | #include "tcp.h" | ||
| 39 | |||
| 40 | DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) | ||
| 41 | ____cacheline_aligned; | ||
| 42 | |||
| 43 | static const char const *rds_tcp_stat_names[] = { | ||
| 44 | "tcp_data_ready_calls", | ||
| 45 | "tcp_write_space_calls", | ||
| 46 | "tcp_sndbuf_full", | ||
| 47 | "tcp_connect_raced", | ||
| 48 | "tcp_listen_closed_stale", | ||
| 49 | }; | ||
| 50 | |||
| 51 | unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, | ||
| 52 | unsigned int avail) | ||
| 53 | { | ||
| 54 | struct rds_tcp_statistics stats = {0, }; | ||
| 55 | uint64_t *src; | ||
| 56 | uint64_t *sum; | ||
| 57 | size_t i; | ||
| 58 | int cpu; | ||
| 59 | |||
| 60 | if (avail < ARRAY_SIZE(rds_tcp_stat_names)) | ||
| 61 | goto out; | ||
| 62 | |||
| 63 | for_each_online_cpu(cpu) { | ||
| 64 | src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); | ||
| 65 | sum = (uint64_t *)&stats; | ||
| 66 | for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) | ||
| 67 | *(sum++) += *(src++); | ||
| 68 | } | ||
| 69 | |||
| 70 | rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, | ||
| 71 | ARRAY_SIZE(rds_tcp_stat_names)); | ||
| 72 | out: | ||
| 73 | return ARRAY_SIZE(rds_tcp_stat_names); | ||
| 74 | } | ||
