diff options
| author | David S. Miller <davem@davemloft.net> | 2016-03-18 22:25:46 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2016-03-18 22:25:46 -0400 |
| commit | 7fa7728d23e7928dfb6e9f5540a97fb2d562b989 (patch) | |
| tree | 4b57cd88bdb39ff9025cafb3f9b8fcbe22660b6c | |
| parent | 79d3b59a93ba25f3b2c72eb4099c189d41d30204 (diff) | |
| parent | a3382e408b645b4f68ec01f9c048e356c62598fb (diff) | |
Merge branch 'rds-buffer-tuning'
Sowmini Varadhan says:
====================
RDS: TCP: tunable socket buffer parameters
Patch 1 uses sysctl to create tunable socket buffer size parameters.
Patch 2 removes an unused constant.
v2: use sysctl
v3: review comments from Santosh Shilimkar, Eric Dumazet
v4: review comments from Hannes Sowa
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | net/rds/tcp.c | 145 |
1 files changed, 134 insertions, 11 deletions
diff --git a/net/rds/tcp.c b/net/rds/tcp.c index ad60299b088b..61ed2a8764ba 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c | |||
| @@ -52,7 +52,34 @@ static LIST_HEAD(rds_tcp_conn_list); | |||
| 52 | 52 | ||
| 53 | static struct kmem_cache *rds_tcp_conn_slab; | 53 | static struct kmem_cache *rds_tcp_conn_slab; |
| 54 | 54 | ||
| 55 | #define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) | 55 | static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, |
| 56 | void __user *buffer, size_t *lenp, | ||
| 57 | loff_t *fpos); | ||
| 58 | |||
| 59 | int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; | ||
| 60 | int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; | ||
| 61 | |||
| 62 | static struct ctl_table rds_tcp_sysctl_table[] = { | ||
| 63 | #define RDS_TCP_SNDBUF 0 | ||
| 64 | { | ||
| 65 | .procname = "rds_tcp_sndbuf", | ||
| 66 | /* data is per-net pointer */ | ||
| 67 | .maxlen = sizeof(int), | ||
| 68 | .mode = 0644, | ||
| 69 | .proc_handler = rds_tcp_skbuf_handler, | ||
| 70 | .extra1 = &rds_tcp_min_sndbuf, | ||
| 71 | }, | ||
| 72 | #define RDS_TCP_RCVBUF 1 | ||
| 73 | { | ||
| 74 | .procname = "rds_tcp_rcvbuf", | ||
| 75 | /* data is per-net pointer */ | ||
| 76 | .maxlen = sizeof(int), | ||
| 77 | .mode = 0644, | ||
| 78 | .proc_handler = rds_tcp_skbuf_handler, | ||
| 79 | .extra1 = &rds_tcp_min_rcvbuf, | ||
| 80 | }, | ||
| 81 | { } | ||
| 82 | }; | ||
| 56 | 83 | ||
| 57 | /* doing it this way avoids calling tcp_sk() */ | 84 | /* doing it this way avoids calling tcp_sk() */ |
| 58 | void rds_tcp_nonagle(struct socket *sock) | 85 | void rds_tcp_nonagle(struct socket *sock) |
| @@ -66,15 +93,6 @@ void rds_tcp_nonagle(struct socket *sock) | |||
| 66 | set_fs(oldfs); | 93 | set_fs(oldfs); |
| 67 | } | 94 | } |
| 68 | 95 | ||
| 69 | /* All module specific customizations to the RDS-TCP socket should be done in | ||
| 70 | * rds_tcp_tune() and applied after socket creation. In general these | ||
| 71 | * customizations should be tunable via module_param() | ||
| 72 | */ | ||
| 73 | void rds_tcp_tune(struct socket *sock) | ||
| 74 | { | ||
| 75 | rds_tcp_nonagle(sock); | ||
| 76 | } | ||
| 77 | |||
| 78 | u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) | 96 | u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) |
| 79 | { | 97 | { |
| 80 | return tcp_sk(tc->t_sock->sk)->snd_nxt; | 98 | return tcp_sk(tc->t_sock->sk)->snd_nxt; |
| @@ -272,8 +290,34 @@ static int rds_tcp_netid; | |||
| 272 | struct rds_tcp_net { | 290 | struct rds_tcp_net { |
| 273 | struct socket *rds_tcp_listen_sock; | 291 | struct socket *rds_tcp_listen_sock; |
| 274 | struct work_struct rds_tcp_accept_w; | 292 | struct work_struct rds_tcp_accept_w; |
| 293 | struct ctl_table_header *rds_tcp_sysctl; | ||
| 294 | struct ctl_table *ctl_table; | ||
| 295 | int sndbuf_size; | ||
| 296 | int rcvbuf_size; | ||
| 275 | }; | 297 | }; |
| 276 | 298 | ||
| 299 | /* All module specific customizations to the RDS-TCP socket should be done in | ||
| 300 | * rds_tcp_tune() and applied after socket creation. | ||
| 301 | */ | ||
| 302 | void rds_tcp_tune(struct socket *sock) | ||
| 303 | { | ||
| 304 | struct sock *sk = sock->sk; | ||
| 305 | struct net *net = sock_net(sk); | ||
| 306 | struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); | ||
| 307 | |||
| 308 | rds_tcp_nonagle(sock); | ||
| 309 | lock_sock(sk); | ||
| 310 | if (rtn->sndbuf_size > 0) { | ||
| 311 | sk->sk_sndbuf = rtn->sndbuf_size; | ||
| 312 | sk->sk_userlocks |= SOCK_SNDBUF_LOCK; | ||
| 313 | } | ||
| 314 | if (rtn->rcvbuf_size > 0) { | ||
| 315 | sk->sk_sndbuf = rtn->rcvbuf_size; | ||
| 316 | sk->sk_userlocks |= SOCK_RCVBUF_LOCK; | ||
| 317 | } | ||
| 318 | release_sock(sk); | ||
| 319 | } | ||
| 320 | |||
| 277 | static void rds_tcp_accept_worker(struct work_struct *work) | 321 | static void rds_tcp_accept_worker(struct work_struct *work) |
| 278 | { | 322 | { |
| 279 | struct rds_tcp_net *rtn = container_of(work, | 323 | struct rds_tcp_net *rtn = container_of(work, |
| @@ -295,20 +339,60 @@ void rds_tcp_accept_work(struct sock *sk) | |||
| 295 | static __net_init int rds_tcp_init_net(struct net *net) | 339 | static __net_init int rds_tcp_init_net(struct net *net) |
| 296 | { | 340 | { |
| 297 | struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); | 341 | struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); |
| 342 | struct ctl_table *tbl; | ||
| 343 | int err = 0; | ||
| 344 | |||
| 345 | memset(rtn, 0, sizeof(*rtn)); | ||
| 298 | 346 | ||
| 347 | /* {snd, rcv}buf_size default to 0, which implies we let the | ||
| 348 | * stack pick the value, and permit auto-tuning of buffer size. | ||
| 349 | */ | ||
| 350 | if (net == &init_net) { | ||
| 351 | tbl = rds_tcp_sysctl_table; | ||
| 352 | } else { | ||
| 353 | tbl = kmemdup(rds_tcp_sysctl_table, | ||
| 354 | sizeof(rds_tcp_sysctl_table), GFP_KERNEL); | ||
| 355 | if (!tbl) { | ||
| 356 | pr_warn("could not set allocate syctl table\n"); | ||
| 357 | return -ENOMEM; | ||
| 358 | } | ||
| 359 | rtn->ctl_table = tbl; | ||
| 360 | } | ||
| 361 | tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size; | ||
| 362 | tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size; | ||
| 363 | rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl); | ||
| 364 | if (!rtn->rds_tcp_sysctl) { | ||
| 365 | pr_warn("could not register sysctl\n"); | ||
| 366 | err = -ENOMEM; | ||
| 367 | goto fail; | ||
| 368 | } | ||
| 299 | rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); | 369 | rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); |
| 300 | if (!rtn->rds_tcp_listen_sock) { | 370 | if (!rtn->rds_tcp_listen_sock) { |
| 301 | pr_warn("could not set up listen sock\n"); | 371 | pr_warn("could not set up listen sock\n"); |
| 302 | return -EAFNOSUPPORT; | 372 | unregister_net_sysctl_table(rtn->rds_tcp_sysctl); |
| 373 | rtn->rds_tcp_sysctl = NULL; | ||
| 374 | err = -EAFNOSUPPORT; | ||
| 375 | goto fail; | ||
| 303 | } | 376 | } |
| 304 | INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); | 377 | INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); |
| 305 | return 0; | 378 | return 0; |
| 379 | |||
| 380 | fail: | ||
| 381 | if (net != &init_net) | ||
| 382 | kfree(tbl); | ||
| 383 | return err; | ||
| 306 | } | 384 | } |
| 307 | 385 | ||
| 308 | static void __net_exit rds_tcp_exit_net(struct net *net) | 386 | static void __net_exit rds_tcp_exit_net(struct net *net) |
| 309 | { | 387 | { |
| 310 | struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); | 388 | struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); |
| 311 | 389 | ||
| 390 | if (rtn->rds_tcp_sysctl) | ||
| 391 | unregister_net_sysctl_table(rtn->rds_tcp_sysctl); | ||
| 392 | |||
| 393 | if (net != &init_net && rtn->ctl_table) | ||
| 394 | kfree(rtn->ctl_table); | ||
| 395 | |||
| 312 | /* If rds_tcp_exit_net() is called as a result of netns deletion, | 396 | /* If rds_tcp_exit_net() is called as a result of netns deletion, |
| 313 | * the rds_tcp_kill_sock() device notifier would already have cleaned | 397 | * the rds_tcp_kill_sock() device notifier would already have cleaned |
| 314 | * up the listen socket, thus there is no work to do in this function. | 398 | * up the listen socket, thus there is no work to do in this function. |
| @@ -383,6 +467,45 @@ static struct notifier_block rds_tcp_dev_notifier = { | |||
| 383 | .priority = -10, /* must be called after other network notifiers */ | 467 | .priority = -10, /* must be called after other network notifiers */ |
| 384 | }; | 468 | }; |
| 385 | 469 | ||
| 470 | /* when sysctl is used to modify some kernel socket parameters,this | ||
| 471 | * function resets the RDS connections in that netns so that we can | ||
| 472 | * restart with new parameters. The assumption is that such reset | ||
| 473 | * events are few and far-between. | ||
| 474 | */ | ||
| 475 | static void rds_tcp_sysctl_reset(struct net *net) | ||
| 476 | { | ||
| 477 | struct rds_tcp_connection *tc, *_tc; | ||
| 478 | |||
| 479 | spin_lock_irq(&rds_tcp_conn_lock); | ||
| 480 | list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { | ||
| 481 | struct net *c_net = read_pnet(&tc->conn->c_net); | ||
| 482 | |||
| 483 | if (net != c_net || !tc->t_sock) | ||
| 484 | continue; | ||
| 485 | |||
| 486 | rds_conn_drop(tc->conn); /* reconnect with new parameters */ | ||
| 487 | } | ||
| 488 | spin_unlock_irq(&rds_tcp_conn_lock); | ||
| 489 | } | ||
| 490 | |||
| 491 | static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, | ||
| 492 | void __user *buffer, size_t *lenp, | ||
| 493 | loff_t *fpos) | ||
| 494 | { | ||
| 495 | struct net *net = current->nsproxy->net_ns; | ||
| 496 | int err; | ||
| 497 | |||
| 498 | err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); | ||
| 499 | if (err < 0) { | ||
| 500 | pr_warn("Invalid input. Must be >= %d\n", | ||
| 501 | *(int *)(ctl->extra1)); | ||
| 502 | return err; | ||
| 503 | } | ||
| 504 | if (write) | ||
| 505 | rds_tcp_sysctl_reset(net); | ||
| 506 | return 0; | ||
| 507 | } | ||
| 508 | |||
| 386 | static void rds_tcp_exit(void) | 509 | static void rds_tcp_exit(void) |
| 387 | { | 510 | { |
| 388 | rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); | 511 | rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); |
