Diffstat (limited to 'net')
-rw-r--r--   net/9p/Makefile                              |    1
-rw-r--r--   net/9p/client.c                              |  161
-rw-r--r--   net/9p/fcprint.c                             |    4
-rw-r--r--   net/9p/mod.c                                 |    9
-rw-r--r--   net/9p/mux.c                                 | 1060
-rw-r--r--   net/9p/trans_fd.c                            | 1103
-rw-r--r--   net/9p/trans_virtio.c                        |  357
-rw-r--r--   net/9p/util.c                                |   20
-rw-r--r--   net/bluetooth/hidp/core.c                    |   49
-rw-r--r--   net/bluetooth/rfcomm/tty.c                   |    3
-rw-r--r--   net/core/net_namespace.c                     |    4
-rw-r--r--   net/core/rtnetlink.c                         |   44
-rw-r--r--   net/core/skbuff.c                            |   29
-rw-r--r--   net/dccp/dccp.h                              |    2
-rw-r--r--   net/dccp/ipv4.c                              |   18
-rw-r--r--   net/dccp/ipv6.c                              |   20
-rw-r--r--   net/dccp/proto.c                             |   18
-rw-r--r--   net/ipv4/cipso_ipv4.c                        |    4
-rw-r--r--   net/ipv4/fib_trie.c                          |    3
-rw-r--r--   net/ipv4/icmp.c                              |    3
-rw-r--r--   net/ipv4/inet_connection_sock.c              |    8
-rw-r--r--   net/ipv4/inet_hashtables.c                   |   64
-rw-r--r--   net/ipv4/ipvs/ip_vs_wrr.c                    |    3
-rw-r--r--   net/ipv4/tcp.c                               |    2
-rw-r--r--   net/ipv4/tcp_ipv4.c                          |   31
-rw-r--r--   net/ipv4/xfrm4_mode_beet.c                   |    2
-rw-r--r--   net/ipv6/icmp.c                              |    3
-rw-r--r--   net/ipv6/inet6_hashtables.c                  |    6
-rw-r--r--   net/ipv6/tcp_ipv6.c                          |   19
-rw-r--r--   net/mac80211/Kconfig                         |   13
-rw-r--r--   net/mac80211/ieee80211.c                     |   14
-rw-r--r--   net/mac80211/rc80211_pid_algo.c              |    2
-rw-r--r--   net/mac80211/rc80211_simple.c                |    2
-rw-r--r--   net/mac80211/rx.c                            |    7
-rw-r--r--   net/netlabel/netlabel_cipso_v4.c             |    2
-rw-r--r--   net/netlabel/netlabel_cipso_v4.h             |    3
-rw-r--r--   net/netlabel/netlabel_domainhash.h           |    1
-rw-r--r--   net/netlabel/netlabel_kapi.c                 |  177
-rw-r--r--   net/rfkill/rfkill.c                          |    2
-rw-r--r--   net/sched/cls_flow.c                         |   21
-rw-r--r--   net/sched/em_meta.c                          |   17
-rw-r--r--   net/sctp/auth.c                              |    6
-rw-r--r--   net/sctp/sm_statefuns.c                      |    8
-rw-r--r--   net/sunrpc/Makefile                          |    3
-rw-r--r--   net/sunrpc/auth_gss/svcauth_gss.c            |   93
-rw-r--r--   net/sunrpc/cache.c                           |  152
-rw-r--r--   net/sunrpc/stats.c                           |    7
-rw-r--r--   net/sunrpc/sunrpc_syms.c                     |   52
-rw-r--r--   net/sunrpc/svc.c                             |   90
-rw-r--r--   net/sunrpc/svc_xprt.c                        | 1055
-rw-r--r--   net/sunrpc/svcauth.c                         |    6
-rw-r--r--   net/sunrpc/svcauth_unix.c                    |   59
-rw-r--r--   net/sunrpc/svcsock.c                         | 1311
-rw-r--r--   net/sunrpc/sysctl.c                          |   31
-rw-r--r--   net/sunrpc/xdr.c                             |    8
-rw-r--r--   net/sunrpc/xprtrdma/Makefile                 |    5
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma.c               |  266
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_marshal.c       |  412
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_recvfrom.c      |  586
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_sendto.c        |  520
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_transport.c     | 1080
61 files changed, 6372 insertions(+), 2689 deletions(-)
diff --git a/net/9p/Makefile b/net/9p/Makefile
index d3abb246ccab..8a1051101898 100644
--- a/net/9p/Makefile
+++ b/net/9p/Makefile
@@ -4,7 +4,6 @@ obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
 
 9pnet-objs := \
 	mod.o \
-	mux.o \
 	client.o \
 	conv.o \
 	error.o \
diff --git a/net/9p/client.c b/net/9p/client.c
index af9199364049..84e087e24146 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -3,6 +3,7 @@
  *
  * 9P Client
  *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
  * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -25,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/poll.h>
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
@@ -32,15 +34,97 @@
 #include <net/9p/9p.h>
 #include <linux/parser.h>
 #include <net/9p/transport.h>
-#include <net/9p/conn.h>
 #include <net/9p/client.h>
 
 static struct p9_fid *p9_fid_create(struct p9_client *clnt);
 static void p9_fid_destroy(struct p9_fid *fid);
 static struct p9_stat *p9_clone_stat(struct p9_stat *st, int dotu);
 
-struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
-					int dotu)
+/*
+ * Client Option Parsing (code inspired by NFS code)
+ *  - a little lazy - parse all client options
+ */
+
+enum {
+	Opt_msize,
+	Opt_trans,
+	Opt_legacy,
+	Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_msize, "msize=%u"},
+	{Opt_legacy, "noextend"},
+	{Opt_trans, "trans=%s"},
+	{Opt_err, NULL},
+};
+
+/**
+ * v9fs_parse_options - parse mount options into session structure
+ * @options: options string passed from mount
+ * @v9ses: existing v9fs session information
+ *
+ */
+
+static void parse_opts(char *options, struct p9_client *clnt)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int ret;
+
+	clnt->trans_mod = v9fs_default_trans();
+	clnt->dotu = 1;
+	clnt->msize = 8192;
+
+	if (!options)
+		return;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if (token < Opt_trans) {
+			ret = match_int(&args[0], &option);
+			if (ret < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					"integer field, but no integer?\n");
+				continue;
+			}
+		}
+		switch (token) {
+		case Opt_msize:
+			clnt->msize = option;
+			break;
+		case Opt_trans:
+			clnt->trans_mod = v9fs_match_trans(&args[0]);
+			break;
+		case Opt_legacy:
+			clnt->dotu = 0;
+			break;
+		default:
+			continue;
+		}
+	}
+}
+
+
+/**
+ * p9_client_rpc - sends 9P request and waits until a response is available.
+ *	The function can be interrupted.
+ * @c: client data
+ * @tc: request to be sent
+ * @rc: pointer where a pointer to the response is stored
+ */
+int
+p9_client_rpc(struct p9_client *c, struct p9_fcall *tc,
+	struct p9_fcall **rc)
+{
+	return c->trans->rpc(c->trans, tc, rc);
+}
+
+struct p9_client *p9_client_create(const char *dev_name, char *options)
 {
 	int err, n;
 	struct p9_client *clnt;
@@ -54,12 +138,7 @@ struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
 	if (!clnt)
 		return ERR_PTR(-ENOMEM);
 
-	P9_DPRINTK(P9_DEBUG_9P, "clnt %p trans %p msize %d dotu %d\n",
-		clnt, trans, msize, dotu);
 	spin_lock_init(&clnt->lock);
-	clnt->trans = trans;
-	clnt->msize = msize;
-	clnt->dotu = dotu;
 	INIT_LIST_HEAD(&clnt->fidlist);
 	clnt->fidpool = p9_idpool_create();
 	if (!clnt->fidpool) {
@@ -68,13 +147,29 @@ struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
 		goto error;
 	}
 
-	clnt->conn = p9_conn_create(clnt->trans, clnt->msize, &clnt->dotu);
-	if (IS_ERR(clnt->conn)) {
-		err = PTR_ERR(clnt->conn);
-		clnt->conn = NULL;
+	parse_opts(options, clnt);
+	if (clnt->trans_mod == NULL) {
+		err = -EPROTONOSUPPORT;
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"No transport defined or default transport\n");
 		goto error;
 	}
 
+	P9_DPRINTK(P9_DEBUG_9P, "clnt %p trans %p msize %d dotu %d\n",
+		clnt, clnt->trans_mod, clnt->msize, clnt->dotu);
+
+
+	clnt->trans = clnt->trans_mod->create(dev_name, options, clnt->msize,
+								clnt->dotu);
+	if (IS_ERR(clnt->trans)) {
+		err = PTR_ERR(clnt->trans);
+		clnt->trans = NULL;
+		goto error;
+	}
+
+	if ((clnt->msize+P9_IOHDRSZ) > clnt->trans_mod->maxsize)
+		clnt->msize = clnt->trans_mod->maxsize-P9_IOHDRSZ;
+
 	tc = p9_create_tversion(clnt->msize, clnt->dotu?"9P2000.u":"9P2000");
 	if (IS_ERR(tc)) {
 		err = PTR_ERR(tc);
@@ -82,7 +177,7 @@ struct p9_client *p9_client_create(struct p9_trans *trans, int msize,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -117,10 +212,6 @@ void p9_client_destroy(struct p9_client *clnt)
 	struct p9_fid *fid, *fidptr;
 
 	P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt);
-	if (clnt->conn) {
-		p9_conn_destroy(clnt->conn);
-		clnt->conn = NULL;
-	}
 
 	if (clnt->trans) {
 		clnt->trans->close(clnt->trans);
@@ -142,7 +233,6 @@ void p9_client_disconnect(struct p9_client *clnt)
 {
 	P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt);
 	clnt->trans->status = Disconnected;
-	p9_conn_cancel(clnt->conn, -EIO);
 }
 EXPORT_SYMBOL(p9_client_disconnect);
 
@@ -174,7 +264,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -219,7 +309,7 @@ struct p9_fid *p9_client_auth(struct p9_client *clnt, char *uname,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -270,7 +360,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err) {
 		if (rc && rc->id == P9_RWALK)
 			goto clunk_fid;
@@ -305,7 +395,7 @@ clunk_fid:
 		goto error;
 	}
 
-	p9_conn_rpc(clnt->conn, tc, &rc);
+	p9_client_rpc(clnt, tc, &rc);
 
 error:
 	kfree(tc);
@@ -339,7 +429,7 @@ int p9_client_open(struct p9_fid *fid, int mode)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -378,7 +468,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -411,7 +501,7 @@ int p9_client_clunk(struct p9_fid *fid)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -443,7 +533,7 @@ int p9_client_remove(struct p9_fid *fid)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto done;
 
@@ -485,7 +575,7 @@ int p9_client_read(struct p9_fid *fid, char *data, u64 offset, u32 count)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -542,7 +632,7 @@ int p9_client_write(struct p9_fid *fid, char *data, u64 offset, u32 count)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -596,7 +686,7 @@ p9_client_uread(struct p9_fid *fid, char __user *data, u64 offset, u32 count)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -660,7 +750,7 @@ p9_client_uwrite(struct p9_fid *fid, const char __user *data, u64 offset,
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -731,7 +821,7 @@ struct p9_stat *p9_client_stat(struct p9_fid *fid)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -773,7 +863,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
 		goto done;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 
 done:
 	kfree(tc);
@@ -830,7 +920,7 @@ struct p9_stat *p9_client_dirread(struct p9_fid *fid, u64 offset)
 		goto error;
 	}
 
-	err = p9_conn_rpc(clnt->conn, tc, &rc);
+	err = p9_client_rpc(clnt, tc, &rc);
 	if (err)
 		goto error;
 
@@ -901,16 +991,21 @@ static struct p9_stat *p9_clone_stat(struct p9_stat *st, int dotu)
 	memmove(ret, st, sizeof(struct p9_stat));
 	p = ((char *) ret) + sizeof(struct p9_stat);
 	memmove(p, st->name.str, st->name.len);
+	ret->name.str = p;
 	p += st->name.len;
 	memmove(p, st->uid.str, st->uid.len);
+	ret->uid.str = p;
 	p += st->uid.len;
 	memmove(p, st->gid.str, st->gid.len);
+	ret->gid.str = p;
 	p += st->gid.len;
 	memmove(p, st->muid.str, st->muid.len);
+	ret->muid.str = p;
 	p += st->muid.len;
 
 	if (dotu) {
 		memmove(p, st->extension.str, st->extension.len);
+		ret->extension.str = p;
 		p += st->extension.len;
 	}
 
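[Note: the parse_opts() routine added above follows the kernel's match_token() idiom: a match_table_t maps option patterns such as "msize=%u" and "trans=%s" to enum values, strsep() splits the comma-separated mount string, and match_int() extracts numeric arguments. As a rough illustration of the same token-table idea, here is a minimal userspace sketch in plain C; it substitutes sscanf() for the kernel's match_int()/match_token(), and the struct and field names are hypothetical, not part of the patch.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the client fields that parse_opts() fills in. */
struct opts {
	unsigned int msize;	/* maximum message size, from "msize=%u" */
	int dotu;		/* 9P2000.u extensions, cleared by "noextend" */
	char trans[16];		/* transport name, from "trans=%s" */
};

/* Walk a comma-separated option string, mimicking the strsep() loop above. */
static void parse_opts_sketch(char *options, struct opts *o)
{
	char *p;

	/* defaults, as in the patch: dotu on, msize 8192 */
	o->msize = 8192;
	o->dotu = 1;
	strcpy(o->trans, "fd");

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;	/* skip empty tokens, e.g. ",," */
		if (sscanf(p, "msize=%u", &o->msize) == 1)
			continue;
		if (sscanf(p, "trans=%15s", o->trans) == 1)
			continue;
		if (strcmp(p, "noextend") == 0)
			o->dotu = 0;
	}
}

int main(void)
{
	char options[] = "msize=4096,trans=tcp,noextend";
	struct opts o;

	parse_opts_sketch(options, &o);
	printf("msize=%u dotu=%d trans=%s\n", o.msize, o.dotu, o.trans);
	return 0;
}

Unknown options simply fall through, matching the "default: continue" behavior in the patch.]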
diff --git a/net/9p/fcprint.c b/net/9p/fcprint.c
index b1ae8ec57d54..40244fbd9b0d 100644
--- a/net/9p/fcprint.c
+++ b/net/9p/fcprint.c
@@ -347,12 +347,12 @@ p9_printfcall(char *buf, int buflen, struct p9_fcall *fc, int extended)
 
 	return ret;
 }
-
 #else
 int
 p9_printfcall(char *buf, int buflen, struct p9_fcall *fc, int extended)
 {
 	return 0;
 }
-EXPORT_SYMBOL(p9_printfcall);
 #endif /* CONFIG_NET_9P_DEBUG */
+EXPORT_SYMBOL(p9_printfcall);
+
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 8f9763a9dc12..c285aab2af04 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -106,15 +106,10 @@ EXPORT_SYMBOL(v9fs_default_trans);
  */
 static int __init init_p9(void)
 {
-	int ret;
+	int ret = 0;
 
 	p9_error_init();
 	printk(KERN_INFO "Installing 9P2000 support\n");
-	ret = p9_mux_global_init();
-	if (ret) {
-		printk(KERN_WARNING "9p: starting mux failed\n");
-		return ret;
-	}
 
 	return ret;
 }
@@ -126,7 +121,7 @@ static int __init init_p9(void)
 
 static void __exit exit_p9(void)
 {
-	p9_mux_global_exit();
+	printk(KERN_INFO "Unloading 9P2000 support\n");
}
 
 module_init(init_p9)
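[Note: with the mux startup moved out of module init, init_p9()/exit_p9() reduce to the standard kernel module lifecycle. For reference, a minimal sketch of that pattern (the names here are illustrative, not from the patch):

#include <linux/module.h>
#include <linux/kernel.h>

static int __init example_init(void)
{
	printk(KERN_INFO "example: loaded\n");
	return 0;	/* a nonzero return would abort module load */
}

static void __exit example_exit(void)
{
	printk(KERN_INFO "example: unloaded\n");
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
]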
diff --git a/net/9p/mux.c b/net/9p/mux.c
deleted file mode 100644
index c9f0805048e4..000000000000
--- a/net/9p/mux.c
+++ /dev/null
@@ -1,1060 +0,0 @@
-/*
- * net/9p/mux.c
- *
- * Protocol Multiplexer
- *
- * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
- * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/kthread.h>
-#include <linux/idr.h>
-#include <linux/mutex.h>
-#include <net/9p/9p.h>
-#include <linux/parser.h>
-#include <net/9p/transport.h>
-#include <net/9p/conn.h>
-
-#define ERREQFLUSH	1
-#define SCHED_TIMEOUT	10
-#define MAXPOLLWADDR	2
-
-enum {
-	Rworksched = 1,		/* read work scheduled or running */
-	Rpending = 2,		/* can read */
-	Wworksched = 4,		/* write work scheduled or running */
-	Wpending = 8,		/* can write */
-};
-
-enum {
-	None,
-	Flushing,
-	Flushed,
-};
-
-struct p9_mux_poll_task;
-
-struct p9_req {
-	spinlock_t lock; /* protect request structure */
-	int tag;
-	struct p9_fcall *tcall;
-	struct p9_fcall *rcall;
-	int err;
-	p9_conn_req_callback cb;
-	void *cba;
-	int flush;
-	struct list_head req_list;
-};
-
-struct p9_conn {
-	spinlock_t lock; /* protect lock structure */
-	struct list_head mux_list;
-	struct p9_mux_poll_task *poll_task;
-	int msize;
-	unsigned char *extended;
-	struct p9_trans *trans;
-	struct p9_idpool *tagpool;
-	int err;
-	wait_queue_head_t equeue;
-	struct list_head req_list;
-	struct list_head unsent_req_list;
-	struct p9_fcall *rcall;
-	int rpos;
-	char *rbuf;
-	int wpos;
-	int wsize;
-	char *wbuf;
-	wait_queue_t poll_wait[MAXPOLLWADDR];
-	wait_queue_head_t *poll_waddr[MAXPOLLWADDR];
-	poll_table pt;
-	struct work_struct rq;
-	struct work_struct wq;
-	unsigned long wsched;
-};
-
-struct p9_mux_poll_task {
-	struct task_struct *task;
-	struct list_head mux_list;
-	int muxnum;
-};
-
-struct p9_mux_rpc {
-	struct p9_conn *m;
-	int err;
-	struct p9_fcall *tcall;
-	struct p9_fcall *rcall;
-	wait_queue_head_t wqueue;
-};
-
-static int p9_poll_proc(void *);
-static void p9_read_work(struct work_struct *work);
-static void p9_write_work(struct work_struct *work);
-static void p9_pollwait(struct file *filp, wait_queue_head_t *wait_address,
-			poll_table * p);
-static u16 p9_mux_get_tag(struct p9_conn *);
-static void p9_mux_put_tag(struct p9_conn *, u16);
-
-static DEFINE_MUTEX(p9_mux_task_lock);
-static struct workqueue_struct *p9_mux_wq;
-
-static int p9_mux_num;
-static int p9_mux_poll_task_num;
-static struct p9_mux_poll_task p9_mux_poll_tasks[100];
-
-int p9_mux_global_init(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++)
-		p9_mux_poll_tasks[i].task = NULL;
-
-	p9_mux_wq = create_workqueue("v9fs");
-	if (!p9_mux_wq) {
-		printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n");
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void p9_mux_global_exit(void)
-{
-	destroy_workqueue(p9_mux_wq);
-}
-
-/**
- * p9_mux_calc_poll_procs - calculates the number of polling procs
- * based on the number of mounted v9fs filesystems.
- *
- * The current implementation returns sqrt of the number of mounts.
- */
-static int p9_mux_calc_poll_procs(int muxnum)
-{
-	int n;
-
-	if (p9_mux_poll_task_num)
-		n = muxnum / p9_mux_poll_task_num +
-		    (muxnum % p9_mux_poll_task_num ? 1 : 0);
-	else
-		n = 1;
-
-	if (n > ARRAY_SIZE(p9_mux_poll_tasks))
-		n = ARRAY_SIZE(p9_mux_poll_tasks);
-
-	return n;
-}
-
-static int p9_mux_poll_start(struct p9_conn *m)
-{
-	int i, n;
-	struct p9_mux_poll_task *vpt, *vptlast;
-	struct task_struct *pproc;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, p9_mux_num,
-		p9_mux_poll_task_num);
-	mutex_lock(&p9_mux_task_lock);
-
-	n = p9_mux_calc_poll_procs(p9_mux_num + 1);
-	if (n > p9_mux_poll_task_num) {
-		for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
-			if (p9_mux_poll_tasks[i].task == NULL) {
-				vpt = &p9_mux_poll_tasks[i];
-				P9_DPRINTK(P9_DEBUG_MUX, "create proc %p\n",
-									vpt);
-				pproc = kthread_create(p9_poll_proc, vpt,
-								"v9fs-poll");
-
-				if (!IS_ERR(pproc)) {
-					vpt->task = pproc;
-					INIT_LIST_HEAD(&vpt->mux_list);
-					vpt->muxnum = 0;
-					p9_mux_poll_task_num++;
-					wake_up_process(vpt->task);
-				}
-				break;
-			}
-		}
-
-		if (i >= ARRAY_SIZE(p9_mux_poll_tasks))
-			P9_DPRINTK(P9_DEBUG_ERROR,
-					"warning: no free poll slots\n");
-	}
-
-	n = (p9_mux_num + 1) / p9_mux_poll_task_num +
-	    ((p9_mux_num + 1) % p9_mux_poll_task_num ? 1 : 0);
-
-	vptlast = NULL;
-	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
-		vpt = &p9_mux_poll_tasks[i];
-		if (vpt->task != NULL) {
-			vptlast = vpt;
-			if (vpt->muxnum < n) {
-				P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
-				list_add(&m->mux_list, &vpt->mux_list);
-				vpt->muxnum++;
-				m->poll_task = vpt;
-				memset(&m->poll_waddr, 0,
-							sizeof(m->poll_waddr));
-				init_poll_funcptr(&m->pt, p9_pollwait);
-				break;
-			}
-		}
-	}
-
-	if (i >= ARRAY_SIZE(p9_mux_poll_tasks)) {
-		if (vptlast == NULL) {
-			mutex_unlock(&p9_mux_task_lock);
-			return -ENOMEM;
-		}
-
-		P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
-		list_add(&m->mux_list, &vptlast->mux_list);
-		vptlast->muxnum++;
-		m->poll_task = vptlast;
-		memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
-		init_poll_funcptr(&m->pt, p9_pollwait);
-	}
-
-	p9_mux_num++;
-	mutex_unlock(&p9_mux_task_lock);
-
-	return 0;
-}
-
-static void p9_mux_poll_stop(struct p9_conn *m)
-{
-	int i;
-	struct p9_mux_poll_task *vpt;
-
-	mutex_lock(&p9_mux_task_lock);
-	vpt = m->poll_task;
-	list_del(&m->mux_list);
-	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
-		if (m->poll_waddr[i] != NULL) {
-			remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]);
-			m->poll_waddr[i] = NULL;
-		}
-	}
-	vpt->muxnum--;
-	if (!vpt->muxnum) {
-		P9_DPRINTK(P9_DEBUG_MUX, "destroy proc %p\n", vpt);
-		kthread_stop(vpt->task);
-		vpt->task = NULL;
-		p9_mux_poll_task_num--;
-	}
-	p9_mux_num--;
-	mutex_unlock(&p9_mux_task_lock);
-}
-
-/**
- * p9_conn_create - allocate and initialize the per-session mux data
- * Creates the polling task if this is the first session.
- *
- * @trans - transport structure
- * @msize - maximum message size
- * @extended - pointer to the extended flag
- */
-struct p9_conn *p9_conn_create(struct p9_trans *trans, int msize,
-				    unsigned char *extended)
-{
-	int i, n;
-	struct p9_conn *m, *mtmp;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "transport %p msize %d\n", trans, msize);
-	m = kmalloc(sizeof(struct p9_conn), GFP_KERNEL);
-	if (!m)
-		return ERR_PTR(-ENOMEM);
-
-	spin_lock_init(&m->lock);
-	INIT_LIST_HEAD(&m->mux_list);
-	m->msize = msize;
-	m->extended = extended;
-	m->trans = trans;
-	m->tagpool = p9_idpool_create();
-	if (IS_ERR(m->tagpool)) {
-		mtmp = ERR_PTR(-ENOMEM);
-		kfree(m);
-		return mtmp;
-	}
-
-	m->err = 0;
-	init_waitqueue_head(&m->equeue);
-	INIT_LIST_HEAD(&m->req_list);
-	INIT_LIST_HEAD(&m->unsent_req_list);
-	m->rcall = NULL;
-	m->rpos = 0;
-	m->rbuf = NULL;
-	m->wpos = m->wsize = 0;
-	m->wbuf = NULL;
-	INIT_WORK(&m->rq, p9_read_work);
-	INIT_WORK(&m->wq, p9_write_work);
-	m->wsched = 0;
-	memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
-	m->poll_task = NULL;
-	n = p9_mux_poll_start(m);
-	if (n) {
-		kfree(m);
-		return ERR_PTR(n);
-	}
-
-	n = trans->poll(trans, &m->pt);
-	if (n & POLLIN) {
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
-		set_bit(Rpending, &m->wsched);
-	}
-
-	if (n & POLLOUT) {
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
-		set_bit(Wpending, &m->wsched);
-	}
-
-	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
-		if (IS_ERR(m->poll_waddr[i])) {
-			p9_mux_poll_stop(m);
-			mtmp = (void *)m->poll_waddr;	/* the error code */
-			kfree(m);
-			m = mtmp;
-			break;
-		}
-	}
-
-	return m;
-}
-EXPORT_SYMBOL(p9_conn_create);
-
-/**
- * p9_mux_destroy - cancels all pending requests and frees mux resources
- */
-void p9_conn_destroy(struct p9_conn *m)
-{
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p prev %p next %p\n", m,
-		m->mux_list.prev, m->mux_list.next);
-	p9_conn_cancel(m, -ECONNRESET);
-
-	if (!list_empty(&m->req_list)) {
-		/* wait until all processes waiting on this session exit */
-		P9_DPRINTK(P9_DEBUG_MUX,
-			"mux %p waiting for empty request queue\n", m);
-		wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000);
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p request queue empty: %d\n", m,
-			list_empty(&m->req_list));
-	}
-
-	p9_mux_poll_stop(m);
-	m->trans = NULL;
-	p9_idpool_destroy(m->tagpool);
-	kfree(m);
-}
-EXPORT_SYMBOL(p9_conn_destroy);
-
-/**
- * p9_pollwait - called by files poll operation to add v9fs-poll task
- * 	to files wait queue
- */
-static void
-p9_pollwait(struct file *filp, wait_queue_head_t *wait_address,
-	      poll_table * p)
-{
-	int i;
-	struct p9_conn *m;
-
-	m = container_of(p, struct p9_conn, pt);
-	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++)
-		if (m->poll_waddr[i] == NULL)
-			break;
-
-	if (i >= ARRAY_SIZE(m->poll_waddr)) {
-		P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n");
-		return;
-	}
-
-	m->poll_waddr[i] = wait_address;
-
-	if (!wait_address) {
-		P9_DPRINTK(P9_DEBUG_ERROR, "no wait_address\n");
-		m->poll_waddr[i] = ERR_PTR(-EIO);
-		return;
-	}
-
-	init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task);
-	add_wait_queue(wait_address, &m->poll_wait[i]);
-}
-
-/**
- * p9_poll_mux - polls a mux and schedules read or write works if necessary
- */
-static void p9_poll_mux(struct p9_conn *m)
-{
-	int n;
-
-	if (m->err < 0)
-		return;
-
-	n = m->trans->poll(m->trans, NULL);
-	if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
-		P9_DPRINTK(P9_DEBUG_MUX, "error mux %p err %d\n", m, n);
-		if (n >= 0)
-			n = -ECONNRESET;
-		p9_conn_cancel(m, n);
-	}
-
-	if (n & POLLIN) {
-		set_bit(Rpending, &m->wsched);
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
-		if (!test_and_set_bit(Rworksched, &m->wsched)) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
-			queue_work(p9_mux_wq, &m->rq);
-		}
-	}
-
-	if (n & POLLOUT) {
-		set_bit(Wpending, &m->wsched);
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
-		if ((m->wsize || !list_empty(&m->unsent_req_list))
-		    && !test_and_set_bit(Wworksched, &m->wsched)) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
-			queue_work(p9_mux_wq, &m->wq);
-		}
-	}
-}
-
-/**
- * p9_poll_proc - polls all v9fs transports for new events and queues
- * 	the appropriate work to the work queue
- */
-static int p9_poll_proc(void *a)
-{
-	struct p9_conn *m, *mtmp;
-	struct p9_mux_poll_task *vpt;
-
-	vpt = a;
-	P9_DPRINTK(P9_DEBUG_MUX, "start %p %p\n", current, vpt);
-	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
-			p9_poll_mux(m);
-		}
-
-		P9_DPRINTK(P9_DEBUG_MUX, "sleeping...\n");
-		schedule_timeout(SCHED_TIMEOUT * HZ);
-	}
-
-	__set_current_state(TASK_RUNNING);
-	P9_DPRINTK(P9_DEBUG_MUX, "finish\n");
-	return 0;
-}
-
-/**
- * p9_write_work - called when a transport can send some data
- */
-static void p9_write_work(struct work_struct *work)
-{
-	int n, err;
-	struct p9_conn *m;
-	struct p9_req *req;
-
-	m = container_of(work, struct p9_conn, wq);
-
-	if (m->err < 0) {
-		clear_bit(Wworksched, &m->wsched);
-		return;
-	}
-
-	if (!m->wsize) {
-		if (list_empty(&m->unsent_req_list)) {
-			clear_bit(Wworksched, &m->wsched);
-			return;
-		}
-
-		spin_lock(&m->lock);
-again:
-		req = list_entry(m->unsent_req_list.next, struct p9_req,
-			       req_list);
-		list_move_tail(&req->req_list, &m->req_list);
-		if (req->err == ERREQFLUSH)
-			goto again;
-
-		m->wbuf = req->tcall->sdata;
-		m->wsize = req->tcall->size;
-		m->wpos = 0;
-		spin_unlock(&m->lock);
-	}
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos,
-								m->wsize);
-	clear_bit(Wpending, &m->wsched);
-	err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos);
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p sent %d bytes\n", m, err);
-	if (err == -EAGAIN) {
-		clear_bit(Wworksched, &m->wsched);
-		return;
-	}
-
-	if (err < 0)
-		goto error;
-	else if (err == 0) {
-		err = -EREMOTEIO;
-		goto error;
-	}
-
-	m->wpos += err;
-	if (m->wpos == m->wsize)
-		m->wpos = m->wsize = 0;
-
-	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
-		if (test_and_clear_bit(Wpending, &m->wsched))
-			n = POLLOUT;
-		else
-			n = m->trans->poll(m->trans, NULL);
-
-		if (n & POLLOUT) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
-			queue_work(p9_mux_wq, &m->wq);
-		} else
-			clear_bit(Wworksched, &m->wsched);
-	} else
-		clear_bit(Wworksched, &m->wsched);
-
-	return;
-
-error:
-	p9_conn_cancel(m, err);
-	clear_bit(Wworksched, &m->wsched);
-}
-
-static void process_request(struct p9_conn *m, struct p9_req *req)
-{
-	int ecode;
-	struct p9_str *ename;
-
-	if (!req->err && req->rcall->id == P9_RERROR) {
-		ecode = req->rcall->params.rerror.errno;
-		ename = &req->rcall->params.rerror.error;
-
-		P9_DPRINTK(P9_DEBUG_MUX, "Rerror %.*s\n", ename->len,
-								ename->str);
-
-		if (*m->extended)
-			req->err = -ecode;
-
-		if (!req->err) {
-			req->err = p9_errstr2errno(ename->str, ename->len);
-
-			if (!req->err) {	/* string match failed */
-				PRINT_FCALL_ERROR("unknown error", req->rcall);
-			}
-
-			if (!req->err)
-				req->err = -ESERVERFAULT;
-		}
-	} else if (req->tcall && req->rcall->id != req->tcall->id + 1) {
-		P9_DPRINTK(P9_DEBUG_ERROR,
-				"fcall mismatch: expected %d, got %d\n",
-				req->tcall->id + 1, req->rcall->id);
-		if (!req->err)
-			req->err = -EIO;
-	}
-}
-
-/**
- * p9_read_work - called when there is some data to be read from a transport
- */
-static void p9_read_work(struct work_struct *work)
-{
-	int n, err;
-	struct p9_conn *m;
-	struct p9_req *req, *rptr, *rreq;
-	struct p9_fcall *rcall;
-	char *rbuf;
-
-	m = container_of(work, struct p9_conn, rq);
-
-	if (m->err < 0)
-		return;
-
-	rcall = NULL;
-	P9_DPRINTK(P9_DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos);
-
-	if (!m->rcall) {
-		m->rcall =
-		    kmalloc(sizeof(struct p9_fcall) + m->msize, GFP_KERNEL);
-		if (!m->rcall) {
-			err = -ENOMEM;
-			goto error;
-		}
-
-		m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
-		m->rpos = 0;
-	}
-
-	clear_bit(Rpending, &m->wsched);
-	err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos);
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p got %d bytes\n", m, err);
-	if (err == -EAGAIN) {
-		clear_bit(Rworksched, &m->wsched);
-		return;
-	}
-
-	if (err <= 0)
-		goto error;
-
-	m->rpos += err;
-	while (m->rpos > 4) {
-		n = le32_to_cpu(*(__le32 *) m->rbuf);
-		if (n >= m->msize) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				"requested packet size too big: %d\n", n);
-			err = -EIO;
-			goto error;
-		}
-
-		if (m->rpos < n)
-			break;
-
-		err =
-		    p9_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended);
-		if (err < 0) {
-			goto error;
-		}
-
-#ifdef CONFIG_NET_9P_DEBUG
-		if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
-			char buf[150];
-
-			p9_printfcall(buf, sizeof(buf), m->rcall,
-				*m->extended);
-			printk(KERN_NOTICE ">>> %p %s\n", m, buf);
-		}
-#endif
-
-		rcall = m->rcall;
-		rbuf = m->rbuf;
-		if (m->rpos > n) {
-			m->rcall = kmalloc(sizeof(struct p9_fcall) + m->msize,
-					   GFP_KERNEL);
-			if (!m->rcall) {
-				err = -ENOMEM;
-				goto error;
-			}
-
-			m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
-			memmove(m->rbuf, rbuf + n, m->rpos - n);
-			m->rpos -= n;
-		} else {
-			m->rcall = NULL;
-			m->rbuf = NULL;
-			m->rpos = 0;
-		}
-
-		P9_DPRINTK(P9_DEBUG_MUX, "mux %p fcall id %d tag %d\n", m,
-							rcall->id, rcall->tag);
-
-		req = NULL;
-		spin_lock(&m->lock);
-		list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
-			if (rreq->tag == rcall->tag) {
-				req = rreq;
-				if (req->flush != Flushing)
-					list_del(&req->req_list);
-				break;
-			}
-		}
-		spin_unlock(&m->lock);
-
-		if (req) {
-			req->rcall = rcall;
-			process_request(m, req);
-
-			if (req->flush != Flushing) {
-				if (req->cb)
-					(*req->cb) (req, req->cba);
-				else
-					kfree(req->rcall);
-
-				wake_up(&m->equeue);
-			}
-		} else {
-			if (err >= 0 && rcall->id != P9_RFLUSH)
-				P9_DPRINTK(P9_DEBUG_ERROR,
-				  "unexpected response mux %p id %d tag %d\n",
-				  m, rcall->id, rcall->tag);
-			kfree(rcall);
-		}
-	}
-
-	if (!list_empty(&m->req_list)) {
-		if (test_and_clear_bit(Rpending, &m->wsched))
-			n = POLLIN;
-		else
-			n = m->trans->poll(m->trans, NULL);
-
-		if (n & POLLIN) {
-			P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
-			queue_work(p9_mux_wq, &m->rq);
-		} else
-			clear_bit(Rworksched, &m->wsched);
-	} else
-		clear_bit(Rworksched, &m->wsched);
-
-	return;
-
-error:
-	p9_conn_cancel(m, err);
-	clear_bit(Rworksched, &m->wsched);
-}
-
-/**
- * p9_send_request - send 9P request
- * The function can sleep until the request is scheduled for sending.
- * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent successfully. Can return errors
- * that can be retrieved by PTR_ERR macros.
- *
- * @m: mux data
- * @tc: request to be sent
- * @cb: callback function to call when response is received
- * @cba: parameter to pass to the callback function
- */
-static struct p9_req *p9_send_request(struct p9_conn *m,
-					  struct p9_fcall *tc,
-					  p9_conn_req_callback cb, void *cba)
-{
-	int n;
-	struct p9_req *req;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current,
-		tc, tc->id);
-	if (m->err < 0)
-		return ERR_PTR(m->err);
-
-	req = kmalloc(sizeof(struct p9_req), GFP_KERNEL);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
-
-	if (tc->id == P9_TVERSION)
-		n = P9_NOTAG;
-	else
-		n = p9_mux_get_tag(m);
-
-	if (n < 0)
-		return ERR_PTR(-ENOMEM);
-
-	p9_set_tag(tc, n);
-
-#ifdef CONFIG_NET_9P_DEBUG
-	if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
-		char buf[150];
-
-		p9_printfcall(buf, sizeof(buf), tc, *m->extended);
-		printk(KERN_NOTICE "<<< %p %s\n", m, buf);
-	}
-#endif
-
-	spin_lock_init(&req->lock);
-	req->tag = n;
-	req->tcall = tc;
-	req->rcall = NULL;
-	req->err = 0;
-	req->cb = cb;
-	req->cba = cba;
-	req->flush = None;
-
-	spin_lock(&m->lock);
-	list_add_tail(&req->req_list, &m->unsent_req_list);
-	spin_unlock(&m->lock);
-
-	if (test_and_clear_bit(Wpending, &m->wsched))
-		n = POLLOUT;
-	else
-		n = m->trans->poll(m->trans, NULL);
-
-	if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
-		queue_work(p9_mux_wq, &m->wq);
-
-	return req;
-}
-
-static void p9_mux_free_request(struct p9_conn *m, struct p9_req *req)
-{
-	p9_mux_put_tag(m, req->tag);
-	kfree(req);
-}
-
-static void p9_mux_flush_cb(struct p9_req *freq, void *a)
-{
-	p9_conn_req_callback cb;
-	int tag;
-	struct p9_conn *m;
-	struct p9_req *req, *rreq, *rptr;
-
-	m = a;
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m,
-		freq->tcall, freq->rcall, freq->err,
-		freq->tcall->params.tflush.oldtag);
-
-	spin_lock(&m->lock);
-	cb = NULL;
-	tag = freq->tcall->params.tflush.oldtag;
-	req = NULL;
-	list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
-		if (rreq->tag == tag) {
-			req = rreq;
-			list_del(&req->req_list);
-			break;
-		}
-	}
-	spin_unlock(&m->lock);
-
-	if (req) {
-		spin_lock(&req->lock);
-		req->flush = Flushed;
-		spin_unlock(&req->lock);
-
-		if (req->cb)
-			(*req->cb) (req, req->cba);
-		else
-			kfree(req->rcall);
-
-		wake_up(&m->equeue);
-	}
-
-	kfree(freq->tcall);
-	kfree(freq->rcall);
-	p9_mux_free_request(m, freq);
-}
-
-static int
-p9_mux_flush_request(struct p9_conn *m, struct p9_req *req)
-{
-	struct p9_fcall *fc;
-	struct p9_req *rreq, *rptr;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
-
-	/* if a response was received for a request, do nothing */
-	spin_lock(&req->lock);
-	if (req->rcall || req->err) {
-		spin_unlock(&req->lock);
-		P9_DPRINTK(P9_DEBUG_MUX,
-			"mux %p req %p response already received\n", m, req);
-		return 0;
-	}
-
-	req->flush = Flushing;
-	spin_unlock(&req->lock);
-
-	spin_lock(&m->lock);
-	/* if the request is not sent yet, just remove it from the list */
-	list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) {
-		if (rreq->tag == req->tag) {
-			P9_DPRINTK(P9_DEBUG_MUX,
-			   "mux %p req %p request is not sent yet\n", m, req);
-			list_del(&rreq->req_list);
-			req->flush = Flushed;
-			spin_unlock(&m->lock);
-			if (req->cb)
-				(*req->cb) (req, req->cba);
-			return 0;
-		}
-	}
-	spin_unlock(&m->lock);
-
-	clear_thread_flag(TIF_SIGPENDING);
-	fc = p9_create_tflush(req->tag);
-	p9_send_request(m, fc, p9_mux_flush_cb, m);
-	return 1;
-}
-
-static void
-p9_conn_rpc_cb(struct p9_req *req, void *a)
-{
-	struct p9_mux_rpc *r;
-
-	P9_DPRINTK(P9_DEBUG_MUX, "req %p r %p\n", req, a);
-	r = a;
-	r->rcall = req->rcall;
-	r->err = req->err;
-
-	if (req->flush != None && !req->err)
-		r->err = -ERESTARTSYS;
-
-	wake_up(&r->wqueue);
-}
-
-/**
- * p9_mux_rpc - sends 9P request and waits until a response is available.
- *	The function can be interrupted.
- * @m: mux data
- * @tc: request to be sent
- * @rc: pointer where a pointer to the response is stored
- */
-int
-p9_conn_rpc(struct p9_conn *m, struct p9_fcall *tc,
-	     struct p9_fcall **rc)
-{
-	int err, sigpending;
-	unsigned long flags;
-	struct p9_req *req;
-	struct p9_mux_rpc r;
-
-	r.err = 0;
-	r.tcall = tc;
-	r.rcall = NULL;
-	r.m = m;
-	init_waitqueue_head(&r.wqueue);
-
-	if (rc)
-		*rc = NULL;
-
-	sigpending = 0;
-	if (signal_pending(current)) {
-		sigpending = 1;
-		clear_thread_flag(TIF_SIGPENDING);
-	}
-
-	req = p9_send_request(m, tc, p9_conn_rpc_cb, &r);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
-		return err;
-	}
-
-	err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
-	if (r.err < 0)
-		err = r.err;
-
-	if (err == -ERESTARTSYS && m->trans->status == Connected
-							&& m->err == 0) {
-		if (p9_mux_flush_request(m, req)) {
-			/* wait until we get response of the flush message */
-			do {
-				clear_thread_flag(TIF_SIGPENDING);
-				err = wait_event_interruptible(r.wqueue,
-					r.rcall || r.err);
-			} while (!r.rcall && !r.err && err == -ERESTARTSYS &&
-				m->trans->status == Connected && !m->err);
-
-			err = -ERESTARTSYS;
-		}
-		sigpending = 1;
-	}
-
-	if (sigpending) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-	}
-
-	if (rc)
-		*rc = r.rcall;
-	else
-		kfree(r.rcall);
-
-	p9_mux_free_request(m, req);
-	if (err > 0)
-		err = -EIO;
-
-	return err;
-}
-EXPORT_SYMBOL(p9_conn_rpc);
-
-#ifdef P9_NONBLOCK
-/**
- * p9_conn_rpcnb - sends 9P request without waiting for response.
- * @m: mux data
- * @tc: request to be sent
- * @cb: callback function to be called when response arrives
- * @cba: value to pass to the callback function
- */
-int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc,
-		   p9_conn_req_callback cb, void *a)
-{
-	int err;
-	struct p9_req *req;
-
-	req = p9_send_request(m, tc, cb, a);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
-		return PTR_ERR(req);
-	}
-
-	P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag);
-	return 0;
-}
-EXPORT_SYMBOL(p9_conn_rpcnb);
-#endif /* P9_NONBLOCK */
-
-/**
- * p9_conn_cancel - cancel all pending requests with error
- * @m: mux data
- * @err: error code
- */
-void p9_conn_cancel(struct p9_conn *m, int err)
-{
-	struct p9_req *req, *rtmp;
-	LIST_HEAD(cancel_list);
-
-	P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
-	m->err = err;
-	spin_lock(&m->lock);
-	list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
-		list_move(&req->req_list, &cancel_list);
-	}
-	list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
-		list_move(&req->req_list, &cancel_list);
-	}
-	spin_unlock(&m->lock);
-
-	list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
-		list_del(&req->req_list);
-		if (!req->err)
-			req->err = err;
-
-		if (req->cb)
-			(*req->cb) (req, req->cba);
-		else
-			kfree(req->rcall);
-	}
-
-	wake_up(&m->equeue);
-}
-EXPORT_SYMBOL(p9_conn_cancel);
-
-static u16 p9_mux_get_tag(struct p9_conn *m)
-{
-	int tag;
-
-	tag = p9_idpool_get(m->tagpool);
-	if (tag < 0)
-		return P9_NOTAG;
-	else
-		return (u16) tag;
-}
-
-static void p9_mux_put_tag(struct p9_conn *m, u16 tag)
-{
-	if (tag != P9_NOTAG && p9_idpool_check(tag, m->tagpool))
-		p9_idpool_put(tag, m->tagpool);
-}
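[Note: the read loop deleted above (and re-created in trans_fd.c below) relies on 9P's wire framing: every message begins with a 4-byte little-endian size field that covers the whole message, so the demuxer buffers bytes until at least the size prefix is present, peels off complete messages, and shifts any remainder to the front of the buffer. A minimal userspace sketch of that framing logic, assuming a caller-managed buffer; the function names are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode the 4-byte little-endian size prefix of a 9P message. */
static uint32_t p9_msg_size(const unsigned char *buf)
{
	return buf[0] | (buf[1] << 8) | (buf[2] << 16) |
		((uint32_t)buf[3] << 24);
}

/*
 * Consume complete messages from buf[0..*pos); mirrors the
 * "while (m->rpos > 4)" loop above: stop on a partial message
 * and memmove the tail to the front, as the mux does with rbuf.
 */
static void demux(unsigned char *buf, int *pos)
{
	while (*pos > 4) {
		uint32_t n = p9_msg_size(buf);

		if (*pos < (int)n)
			break;		/* partial message: wait for more data */
		printf("got complete message of %u bytes\n", n);
		memmove(buf, buf + n, *pos - n);
		*pos -= n;
	}
}

The kernel version additionally rejects any size larger than the negotiated msize, since a well-behaved peer can never legally send one.]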
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 62332ed9da4a..1aa9d5175398 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -5,7 +5,7 @@
  *
  * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
  * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
- * Copyright (C) 2004-2007 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
  * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
+#include <linux/kthread.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/un.h>
@@ -42,7 +43,9 @@
 
 #define P9_PORT 564
 #define MAX_SOCK_BUF (64*1024)
-
+#define ERREQFLUSH	1
+#define SCHED_TIMEOUT	10
+#define MAXPOLLWADDR	2
 
 struct p9_fd_opts {
 	int rfd;
@@ -53,6 +56,7 @@ struct p9_fd_opts {
 struct p9_trans_fd {
 	struct file *rd;
 	struct file *wr;
+	struct p9_conn *conn;
 };
 
 /*
@@ -72,6 +76,1028 @@ static match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
+enum {
+	Rworksched = 1,		/* read work scheduled or running */
+	Rpending = 2,		/* can read */
+	Wworksched = 4,		/* write work scheduled or running */
+	Wpending = 8,		/* can write */
+};
+
+enum {
+	None,
+	Flushing,
+	Flushed,
+};
+
+struct p9_req;
+
+typedef void (*p9_conn_req_callback)(struct p9_req *req, void *a);
+struct p9_req {
+	spinlock_t lock; /* protect request structure */
+	int tag;
+	struct p9_fcall *tcall;
+	struct p9_fcall *rcall;
+	int err;
+	p9_conn_req_callback cb;
+	void *cba;
+	int flush;
+	struct list_head req_list;
+};
+
+struct p9_mux_poll_task;
+
+struct p9_conn {
+	spinlock_t lock; /* protect lock structure */
+	struct list_head mux_list;
+	struct p9_mux_poll_task *poll_task;
+	int msize;
+	unsigned char extended;
+	struct p9_trans *trans;
+	struct p9_idpool *tagpool;
+	int err;
+	wait_queue_head_t equeue;
+	struct list_head req_list;
+	struct list_head unsent_req_list;
+	struct p9_fcall *rcall;
+	int rpos;
+	char *rbuf;
+	int wpos;
+	int wsize;
+	char *wbuf;
+	wait_queue_t poll_wait[MAXPOLLWADDR];
+	wait_queue_head_t *poll_waddr[MAXPOLLWADDR];
+	poll_table pt;
+	struct work_struct rq;
+	struct work_struct wq;
+	unsigned long wsched;
+};
+
+struct p9_mux_poll_task {
+	struct task_struct *task;
+	struct list_head mux_list;
+	int muxnum;
+};
+
+struct p9_mux_rpc {
+	struct p9_conn *m;
+	int err;
+	struct p9_fcall *tcall;
+	struct p9_fcall *rcall;
+	wait_queue_head_t wqueue;
+};
+
+static int p9_poll_proc(void *);
+static void p9_read_work(struct work_struct *work);
+static void p9_write_work(struct work_struct *work);
+static void p9_pollwait(struct file *filp, wait_queue_head_t *wait_address,
+								poll_table *p);
+static int p9_fd_write(struct p9_trans *trans, void *v, int len);
+static int p9_fd_read(struct p9_trans *trans, void *v, int len);
+
+static DEFINE_MUTEX(p9_mux_task_lock);
+static struct workqueue_struct *p9_mux_wq;
+
+static int p9_mux_num;
+static int p9_mux_poll_task_num;
+static struct p9_mux_poll_task p9_mux_poll_tasks[100];
+
+static void p9_conn_destroy(struct p9_conn *);
+static unsigned int p9_fd_poll(struct p9_trans *trans,
+						struct poll_table_struct *pt);
+
+#ifdef P9_NONBLOCK
+static int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc,
+	p9_conn_req_callback cb, void *a);
+#endif /* P9_NONBLOCK */
+
+static void p9_conn_cancel(struct p9_conn *m, int err);
+
+static int p9_mux_global_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++)
+		p9_mux_poll_tasks[i].task = NULL;
+
+	p9_mux_wq = create_workqueue("v9fs");
+	if (!p9_mux_wq) {
+		printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static u16 p9_mux_get_tag(struct p9_conn *m)
+{
+	int tag;
+
+	tag = p9_idpool_get(m->tagpool);
+	if (tag < 0)
+		return P9_NOTAG;
+	else
+		return (u16) tag;
+}
+
+static void p9_mux_put_tag(struct p9_conn *m, u16 tag)
+{
+	if (tag != P9_NOTAG && p9_idpool_check(tag, m->tagpool))
+		p9_idpool_put(tag, m->tagpool);
+}
+
+/**
+ * p9_mux_calc_poll_procs - calculates the number of polling procs
+ * based on the number of mounted v9fs filesystems.
+ *
+ * The current implementation returns sqrt of the number of mounts.
+ */
+static int p9_mux_calc_poll_procs(int muxnum)
+{
+	int n;
+
+	if (p9_mux_poll_task_num)
+		n = muxnum / p9_mux_poll_task_num +
+		    (muxnum % p9_mux_poll_task_num ? 1 : 0);
+	else
+		n = 1;
+
+	if (n > ARRAY_SIZE(p9_mux_poll_tasks))
+		n = ARRAY_SIZE(p9_mux_poll_tasks);
+
+	return n;
+}
+
+static int p9_mux_poll_start(struct p9_conn *m)
+{
+	int i, n;
+	struct p9_mux_poll_task *vpt, *vptlast;
+	struct task_struct *pproc;
+
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, p9_mux_num,
+		p9_mux_poll_task_num);
+	mutex_lock(&p9_mux_task_lock);
+
+	n = p9_mux_calc_poll_procs(p9_mux_num + 1);
+	if (n > p9_mux_poll_task_num) {
+		for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
+			if (p9_mux_poll_tasks[i].task == NULL) {
+				vpt = &p9_mux_poll_tasks[i];
+				P9_DPRINTK(P9_DEBUG_MUX, "create proc %p\n",
+									vpt);
+				pproc = kthread_create(p9_poll_proc, vpt,
+								"v9fs-poll");
+
+				if (!IS_ERR(pproc)) {
+					vpt->task = pproc;
+					INIT_LIST_HEAD(&vpt->mux_list);
+					vpt->muxnum = 0;
+					p9_mux_poll_task_num++;
+					wake_up_process(vpt->task);
+				}
+				break;
+			}
+		}
+
+		if (i >= ARRAY_SIZE(p9_mux_poll_tasks))
+			P9_DPRINTK(P9_DEBUG_ERROR,
+					"warning: no free poll slots\n");
+	}
+
+	n = (p9_mux_num + 1) / p9_mux_poll_task_num +
+	    ((p9_mux_num + 1) % p9_mux_poll_task_num ? 1 : 0);
+
+	vptlast = NULL;
+	for (i = 0; i < ARRAY_SIZE(p9_mux_poll_tasks); i++) {
+		vpt = &p9_mux_poll_tasks[i];
+		if (vpt->task != NULL) {
+			vptlast = vpt;
+			if (vpt->muxnum < n) {
+				P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
+				list_add(&m->mux_list, &vpt->mux_list);
+				vpt->muxnum++;
+				m->poll_task = vpt;
+				memset(&m->poll_waddr, 0,
+							sizeof(m->poll_waddr));
+				init_poll_funcptr(&m->pt, p9_pollwait);
+				break;
+			}
+		}
+	}
+
+	if (i >= ARRAY_SIZE(p9_mux_poll_tasks)) {
+		if (vptlast == NULL) {
+			mutex_unlock(&p9_mux_task_lock);
+			return -ENOMEM;
+		}
+
+		P9_DPRINTK(P9_DEBUG_MUX, "put in proc %d\n", i);
+		list_add(&m->mux_list, &vptlast->mux_list);
+		vptlast->muxnum++;
+		m->poll_task = vptlast;
+		memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
+		init_poll_funcptr(&m->pt, p9_pollwait);
+	}
+
+	p9_mux_num++;
+	mutex_unlock(&p9_mux_task_lock);
+
+	return 0;
+}
+
+static void p9_mux_poll_stop(struct p9_conn *m)
+{
+	int i;
+	struct p9_mux_poll_task *vpt;
+
+	mutex_lock(&p9_mux_task_lock);
+	vpt = m->poll_task;
+	list_del(&m->mux_list);
+	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
+		if (m->poll_waddr[i] != NULL) {
+			remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]);
+			m->poll_waddr[i] = NULL;
+		}
+	}
+	vpt->muxnum--;
+	if (!vpt->muxnum) {
+		P9_DPRINTK(P9_DEBUG_MUX, "destroy proc %p\n", vpt);
+		kthread_stop(vpt->task);
+		vpt->task = NULL;
+		p9_mux_poll_task_num--;
+	}
+	p9_mux_num--;
+	mutex_unlock(&p9_mux_task_lock);
+}
+
+/**
+ * p9_conn_create - allocate and initialize the per-session mux data
+ * Creates the polling task if this is the first session.
+ *
+ * @trans - transport structure
+ * @msize - maximum message size
+ * @extended - extended flag
+ */
+static struct p9_conn *p9_conn_create(struct p9_trans *trans)
+{
+	int i, n;
+	struct p9_conn *m, *mtmp;
+
+	P9_DPRINTK(P9_DEBUG_MUX, "transport %p msize %d\n", trans,
+								trans->msize);
+	m = kmalloc(sizeof(struct p9_conn), GFP_KERNEL);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&m->lock);
+	INIT_LIST_HEAD(&m->mux_list);
+	m->msize = trans->msize;
+	m->extended = trans->extended;
+	m->trans = trans;
+	m->tagpool = p9_idpool_create();
+	if (IS_ERR(m->tagpool)) {
+		mtmp = ERR_PTR(-ENOMEM);
+		kfree(m);
+		return mtmp;
+	}
+
+	m->err = 0;
+	init_waitqueue_head(&m->equeue);
+	INIT_LIST_HEAD(&m->req_list);
+	INIT_LIST_HEAD(&m->unsent_req_list);
+	m->rcall = NULL;
+	m->rpos = 0;
+	m->rbuf = NULL;
+	m->wpos = m->wsize = 0;
+	m->wbuf = NULL;
+	INIT_WORK(&m->rq, p9_read_work);
+	INIT_WORK(&m->wq, p9_write_work);
+	m->wsched = 0;
+	memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
+	m->poll_task = NULL;
+	n = p9_mux_poll_start(m);
+	if (n) {
+		kfree(m);
+		return ERR_PTR(n);
+	}
+
+	n = p9_fd_poll(trans, &m->pt);
+	if (n & POLLIN) {
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
+		set_bit(Rpending, &m->wsched);
+	}
+
+	if (n & POLLOUT) {
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
+		set_bit(Wpending, &m->wsched);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
+		if (IS_ERR(m->poll_waddr[i])) {
+			p9_mux_poll_stop(m);
+			mtmp = (void *)m->poll_waddr;	/* the error code */
+			kfree(m);
+			m = mtmp;
+			break;
+		}
+	}
+
+	return m;
+}
+
+/**
+ * p9_mux_destroy - cancels all pending requests and frees mux resources
+ */
+static void p9_conn_destroy(struct p9_conn *m)
+{
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p prev %p next %p\n", m,
+		m->mux_list.prev, m->mux_list.next);
+	p9_conn_cancel(m, -ECONNRESET);
+
+	if (!list_empty(&m->req_list)) {
+		/* wait until all processes waiting on this session exit */
+		P9_DPRINTK(P9_DEBUG_MUX,
+			"mux %p waiting for empty request queue\n", m);
+		wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000);
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p request queue empty: %d\n", m,
+			list_empty(&m->req_list));
+	}
+
+	p9_mux_poll_stop(m);
+	m->trans = NULL;
+	p9_idpool_destroy(m->tagpool);
+	kfree(m);
+}
+
+/**
+ * p9_pollwait - called by files poll operation to add v9fs-poll task
+ * 	to files wait queue
+ */
+static void
+p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+	int i;
+	struct p9_conn *m;
+
+	m = container_of(p, struct p9_conn, pt);
+	for (i = 0; i < ARRAY_SIZE(m->poll_waddr); i++)
+		if (m->poll_waddr[i] == NULL)
+			break;
+
+	if (i >= ARRAY_SIZE(m->poll_waddr)) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n");
+		return;
+	}
+
+	m->poll_waddr[i] = wait_address;
+
+	if (!wait_address) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "no wait_address\n");
+		m->poll_waddr[i] = ERR_PTR(-EIO);
+		return;
+	}
+
+	init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task);
+	add_wait_queue(wait_address, &m->poll_wait[i]);
+}
+
+/**
+ * p9_poll_mux - polls a mux and schedules read or write works if necessary
+ */
+static void p9_poll_mux(struct p9_conn *m)
+{
+	int n;
+
+	if (m->err < 0)
+		return;
+
+	n = p9_fd_poll(m->trans, NULL);
+	if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
+		P9_DPRINTK(P9_DEBUG_MUX, "error mux %p err %d\n", m, n);
+		if (n >= 0)
+			n = -ECONNRESET;
+		p9_conn_cancel(m, n);
+	}
+
+	if (n & POLLIN) {
+		set_bit(Rpending, &m->wsched);
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can read\n", m);
+		if (!test_and_set_bit(Rworksched, &m->wsched)) {
+			P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
+			queue_work(p9_mux_wq, &m->rq);
+		}
+	}
+
+	if (n & POLLOUT) {
+		set_bit(Wpending, &m->wsched);
+		P9_DPRINTK(P9_DEBUG_MUX, "mux %p can write\n", m);
+		if ((m->wsize || !list_empty(&m->unsent_req_list))
+		    && !test_and_set_bit(Wworksched, &m->wsched)) {
+			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
+			queue_work(p9_mux_wq, &m->wq);
+		}
+	}
+}
+
+/**
+ * p9_poll_proc - polls all v9fs transports for new events and queues
+ * 	the appropriate work to the work queue
+ */
+static int p9_poll_proc(void *a)
+{
+	struct p9_conn *m, *mtmp;
+	struct p9_mux_poll_task *vpt;
+
+	vpt = a;
+	P9_DPRINTK(P9_DEBUG_MUX, "start %p %p\n", current, vpt);
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
+			p9_poll_mux(m);
+		}
+
+		P9_DPRINTK(P9_DEBUG_MUX, "sleeping...\n");
+		schedule_timeout(SCHED_TIMEOUT * HZ);
+	}
+
+	__set_current_state(TASK_RUNNING);
+	P9_DPRINTK(P9_DEBUG_MUX, "finish\n");
+	return 0;
+}
+
+/**
+ * p9_write_work - called when a transport can send some data
+ */
+static void p9_write_work(struct work_struct *work)
+{
+	int n, err;
+	struct p9_conn *m;
+	struct p9_req *req;
+
+	m = container_of(work, struct p9_conn, wq);
+
+	if (m->err < 0) {
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (!m->wsize) {
+		if (list_empty(&m->unsent_req_list)) {
+			clear_bit(Wworksched, &m->wsched);
+			return;
+		}
+
+		spin_lock(&m->lock);
+again:
+		req = list_entry(m->unsent_req_list.next, struct p9_req,
+			       req_list);
+		list_move_tail(&req->req_list, &m->req_list);
+		if (req->err == ERREQFLUSH)
+			goto again;
+
+		m->wbuf = req->tcall->sdata;
+		m->wsize = req->tcall->size;
+		m->wpos = 0;
+		spin_unlock(&m->lock);
+	}
+
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos,
+								m->wsize);
+	clear_bit(Wpending, &m->wsched);
+	err = p9_fd_write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos);
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p sent %d bytes\n", m, err);
+	if (err == -EAGAIN) {
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (err < 0)
+		goto error;
+	else if (err == 0) {
+		err = -EREMOTEIO;
+		goto error;
+	}
+
+	m->wpos += err;
+	if (m->wpos == m->wsize)
+		m->wpos = m->wsize = 0;
+
+	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
+		if (test_and_clear_bit(Wpending, &m->wsched))
+			n = POLLOUT;
+		else
+			n = p9_fd_poll(m->trans, NULL);
+
+		if (n & POLLOUT) {
+			P9_DPRINTK(P9_DEBUG_MUX, "schedule write work %p\n", m);
+			queue_work(p9_mux_wq, &m->wq);
+		} else
+			clear_bit(Wworksched, &m->wsched);
+	} else
+		clear_bit(Wworksched, &m->wsched);
+
+	return;
+
+error:
+	p9_conn_cancel(m, err);
+	clear_bit(Wworksched, &m->wsched);
+}
+
+static void process_request(struct p9_conn *m, struct p9_req *req)
+{
+	int ecode;
+	struct p9_str *ename;
+
+	if (!req->err && req->rcall->id == P9_RERROR) {
+		ecode = req->rcall->params.rerror.errno;
+		ename = &req->rcall->params.rerror.error;
+
+		P9_DPRINTK(P9_DEBUG_MUX, "Rerror %.*s\n", ename->len,
+								ename->str);
+
+		if (m->extended)
+			req->err = -ecode;
+
+		if (!req->err) {
+			req->err = p9_errstr2errno(ename->str, ename->len);
+
+			/* string match failed */
+			if (!req->err) {
+				PRINT_FCALL_ERROR("unknown error", req->rcall);
+				req->err = -ESERVERFAULT;
+			}
+		}
+	} else if (req->tcall && req->rcall->id != req->tcall->id + 1) {
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"fcall mismatch: expected %d, got %d\n",
+				req->tcall->id + 1, req->rcall->id);
+		if (!req->err)
+			req->err = -EIO;
+	}
+}
+
+/**
+ * p9_read_work - called when there is some data to be read from a transport
+ */
+static void p9_read_work(struct work_struct *work)
+{
+	int n, err;
+	struct p9_conn *m;
+	struct p9_req *req, *rptr, *rreq;
+	struct p9_fcall *rcall;
+	char *rbuf;
+
+	m = container_of(work, struct p9_conn, rq);
+
+	if (m->err < 0)
+		return;
+
+	rcall = NULL;
+	P9_DPRINTK(P9_DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos);
+
+	if (!m->rcall) {
+		m->rcall =
+		    kmalloc(sizeof(struct p9_fcall) + m->msize, GFP_KERNEL);
+		if (!m->rcall) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
+		m->rpos = 0;
+	}
+
+	clear_bit(Rpending, &m->wsched);
+	err = p9_fd_read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos);
+	P9_DPRINTK(P9_DEBUG_MUX, "mux %p got %d bytes\n", m, err);
+	if (err == -EAGAIN) {
+		clear_bit(Rworksched, &m->wsched);
+		return;
+	}
+
+	if (err <= 0)
+		goto error;
+
+	m->rpos += err;
+	while (m->rpos > 4) {
+		n = le32_to_cpu(*(__le32 *) m->rbuf);
+		if (n >= m->msize) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				"requested packet size too big: %d\n", n);
+			err = -EIO;
+			goto error;
+		}
+
+		if (m->rpos < n)
+			break;
+
+		err =
+		    p9_deserialize_fcall(m->rbuf, n, m->rcall, m->extended);
+		if (err < 0)
+			goto error;
+
+#ifdef CONFIG_NET_9P_DEBUG
+		if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
+			char buf[150];
+
+			p9_printfcall(buf, sizeof(buf), m->rcall,
+				m->extended);
+			printk(KERN_NOTICE ">>> %p %s\n", m, buf);
+		}
+#endif
+
+		rcall = m->rcall;
+		rbuf = m->rbuf;
711 if (m->rpos > n) {
712 m->rcall = kmalloc(sizeof(struct p9_fcall) + m->msize,
713 GFP_KERNEL);
714 if (!m->rcall) {
715 err = -ENOMEM;
716 goto error;
717 }
718
719 m->rbuf = (char *)m->rcall + sizeof(struct p9_fcall);
720 memmove(m->rbuf, rbuf + n, m->rpos - n);
721 m->rpos -= n;
722 } else {
723 m->rcall = NULL;
724 m->rbuf = NULL;
725 m->rpos = 0;
726 }
727
728 P9_DPRINTK(P9_DEBUG_MUX, "mux %p fcall id %d tag %d\n", m,
729 rcall->id, rcall->tag);
730
731 req = NULL;
732 spin_lock(&m->lock);
733 list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
734 if (rreq->tag == rcall->tag) {
735 req = rreq;
736 if (req->flush != Flushing)
737 list_del(&req->req_list);
738 break;
739 }
740 }
741 spin_unlock(&m->lock);
742
743 if (req) {
744 req->rcall = rcall;
745 process_request(m, req);
746
747 if (req->flush != Flushing) {
748 if (req->cb)
749 (*req->cb) (req, req->cba);
750 else
751 kfree(req->rcall);
752
753 wake_up(&m->equeue);
754 }
755 } else {
756 if (err >= 0 && rcall->id != P9_RFLUSH)
757 P9_DPRINTK(P9_DEBUG_ERROR,
758 "unexpected response mux %p id %d tag %d\n",
759 m, rcall->id, rcall->tag);
760 kfree(rcall);
761 }
762 }
763
764 if (!list_empty(&m->req_list)) {
765 if (test_and_clear_bit(Rpending, &m->wsched))
766 n = POLLIN;
767 else
768 n = p9_fd_poll(m->trans, NULL);
769
770 if (n & POLLIN) {
771 P9_DPRINTK(P9_DEBUG_MUX, "schedule read work %p\n", m);
772 queue_work(p9_mux_wq, &m->rq);
773 } else
774 clear_bit(Rworksched, &m->wsched);
775 } else
776 clear_bit(Rworksched, &m->wsched);
777
778 return;
779
780error:
781 p9_conn_cancel(m, err);
782 clear_bit(Rworksched, &m->wsched);
783}
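
The `while (m->rpos > 4)` loop is reassembling 9P frames: every 9P message begins with size[4], a little-endian count of the whole message including the header, which is what lets a byte stream be cut back into complete fcalls. The framing rule as a sketch:

	/* Sketch: 9P wire framing. With at least 4 bytes buffered, the
	 * leading little-endian u32 gives the total message length; the
	 * message is complete once that many bytes have arrived. */
	u32 size = le32_to_cpu(*(__le32 *)rbuf);	/* whole-message length */
	if (rpos >= size)
		/* rbuf[0..size) is one complete fcall: deserialize it */;
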
784
785/**
786 * p9_send_request - send a 9P request
787 * The function can sleep until the request is scheduled for sending.
788 * It can be interrupted. Returning from the function does not
789 * guarantee that the request was sent successfully; errors are
790 * returned as ERR_PTR values retrievable with the PTR_ERR macro.
791 *
792 * @m: mux data
793 * @tc: request to be sent
794 * @cb: callback function to call when response is received
795 * @cba: parameter to pass to the callback function
796 */
797static struct p9_req *p9_send_request(struct p9_conn *m,
798 struct p9_fcall *tc,
799 p9_conn_req_callback cb, void *cba)
800{
801 int n;
802 struct p9_req *req;
803
804 P9_DPRINTK(P9_DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current,
805 tc, tc->id);
806 if (m->err < 0)
807 return ERR_PTR(m->err);
808
809 req = kmalloc(sizeof(struct p9_req), GFP_KERNEL);
810 if (!req)
811 return ERR_PTR(-ENOMEM);
812
813 if (tc->id == P9_TVERSION)
814 n = P9_NOTAG;
815 else
816 n = p9_mux_get_tag(m);
817
818 if (n < 0)
819 return ERR_PTR(-ENOMEM);
820
821 p9_set_tag(tc, n);
822
823#ifdef CONFIG_NET_9P_DEBUG
824 if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
825 char buf[150];
826
827 p9_printfcall(buf, sizeof(buf), tc, m->extended);
828 printk(KERN_NOTICE "<<< %p %s\n", m, buf);
829 }
830#endif
831
832 spin_lock_init(&req->lock);
833 req->tag = n;
834 req->tcall = tc;
835 req->rcall = NULL;
836 req->err = 0;
837 req->cb = cb;
838 req->cba = cba;
839 req->flush = None;
840
841 spin_lock(&m->lock);
842 list_add_tail(&req->req_list, &m->unsent_req_list);
843 spin_unlock(&m->lock);
844
845 if (test_and_clear_bit(Wpending, &m->wsched))
846 n = POLLOUT;
847 else
848 n = p9_fd_poll(m->trans, NULL);
849
850 if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
851 queue_work(p9_mux_wq, &m->wq);
852
853 return req;
854}
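
Tag handling is the subtle part of the send path: Tversion must carry the reserved P9_NOTAG value because it (re)negotiates the session, while every other request draws a unique tag from the mux's pool so the read side can match responses back. The matching rule, sketched:

	/* Sketch: tags are unique among in-flight requests on a
	 * connection; P9_NOTAG (~0) is reserved for Tversion, which is
	 * never concurrent with other traffic. */
	u16 tag = (tc->id == P9_TVERSION) ? P9_NOTAG : p9_mux_get_tag(m);
	p9_set_tag(tc, tag);
	/* on receive, the request whose req->tag == rcall->tag completes */
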
855
856static void p9_mux_free_request(struct p9_conn *m, struct p9_req *req)
857{
858 p9_mux_put_tag(m, req->tag);
859 kfree(req);
860}
861
862static void p9_mux_flush_cb(struct p9_req *freq, void *a)
863{
864 p9_conn_req_callback cb;
865 int tag;
866 struct p9_conn *m;
867 struct p9_req *req, *rreq, *rptr;
868
869 m = a;
870 P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m,
871 freq->tcall, freq->rcall, freq->err,
872 freq->tcall->params.tflush.oldtag);
873
874 spin_lock(&m->lock);
875 cb = NULL;
876 tag = freq->tcall->params.tflush.oldtag;
877 req = NULL;
878 list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
879 if (rreq->tag == tag) {
880 req = rreq;
881 list_del(&req->req_list);
882 break;
883 }
884 }
885 spin_unlock(&m->lock);
886
887 if (req) {
888 spin_lock(&req->lock);
889 req->flush = Flushed;
890 spin_unlock(&req->lock);
891
892 if (req->cb)
893 (*req->cb) (req, req->cba);
894 else
895 kfree(req->rcall);
896
897 wake_up(&m->equeue);
898 }
899
900 kfree(freq->tcall);
901 kfree(freq->rcall);
902 p9_mux_free_request(m, freq);
903}
904
905static int
906p9_mux_flush_request(struct p9_conn *m, struct p9_req *req)
907{
908 struct p9_fcall *fc;
909 struct p9_req *rreq, *rptr;
910
911 P9_DPRINTK(P9_DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
912
913 /* if a response was received for a request, do nothing */
914 spin_lock(&req->lock);
915 if (req->rcall || req->err) {
916 spin_unlock(&req->lock);
917 P9_DPRINTK(P9_DEBUG_MUX,
918 "mux %p req %p response already received\n", m, req);
919 return 0;
920 }
921
922 req->flush = Flushing;
923 spin_unlock(&req->lock);
924
925 spin_lock(&m->lock);
926 /* if the request is not sent yet, just remove it from the list */
927 list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) {
928 if (rreq->tag == req->tag) {
929 P9_DPRINTK(P9_DEBUG_MUX,
930 "mux %p req %p request is not sent yet\n", m, req);
931 list_del(&rreq->req_list);
932 req->flush = Flushed;
933 spin_unlock(&m->lock);
934 if (req->cb)
935 (*req->cb) (req, req->cba);
936 return 0;
937 }
938 }
939 spin_unlock(&m->lock);
940
941 clear_thread_flag(TIF_SIGPENDING);
942 fc = p9_create_tflush(req->tag);
943 p9_send_request(m, fc, p9_mux_flush_cb, m);
944 return 1;
945}
946
947static void
948p9_conn_rpc_cb(struct p9_req *req, void *a)
949{
950 struct p9_mux_rpc *r;
951
952 P9_DPRINTK(P9_DEBUG_MUX, "req %p r %p\n", req, a);
953 r = a;
954 r->rcall = req->rcall;
955 r->err = req->err;
956
957 if (req->flush != None && !req->err)
958 r->err = -ERESTARTSYS;
959
960 wake_up(&r->wqueue);
961}
962
963/**
964 * p9_fd_rpc - sends a 9P request and waits until a response is
965 * available. The function can be interrupted.
966 * @m: mux data
967 * @tc: request to be sent
968 * @rc: pointer where a pointer to the response is stored
969 */
970int
971p9_fd_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc)
972{
973 struct p9_trans_fd *p = t->priv;
974 struct p9_conn *m = p->conn;
975 int err, sigpending;
976 unsigned long flags;
977 struct p9_req *req;
978 struct p9_mux_rpc r;
979
980 r.err = 0;
981 r.tcall = tc;
982 r.rcall = NULL;
983 r.m = m;
984 init_waitqueue_head(&r.wqueue);
985
986 if (rc)
987 *rc = NULL;
988
989 sigpending = 0;
990 if (signal_pending(current)) {
991 sigpending = 1;
992 clear_thread_flag(TIF_SIGPENDING);
993 }
994
995 req = p9_send_request(m, tc, p9_conn_rpc_cb, &r);
996 if (IS_ERR(req)) {
997 err = PTR_ERR(req);
998 P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
999 return err;
1000 }
1001
1002 err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
1003 if (r.err < 0)
1004 err = r.err;
1005
1006 if (err == -ERESTARTSYS && m->trans->status == Connected
1007 && m->err == 0) {
1008 if (p9_mux_flush_request(m, req)) {
1009 /* wait until we get response of the flush message */
1010 do {
1011 clear_thread_flag(TIF_SIGPENDING);
1012 err = wait_event_interruptible(r.wqueue,
1013 r.rcall || r.err);
1014 } while (!r.rcall && !r.err && err == -ERESTARTSYS &&
1015 m->trans->status == Connected && !m->err);
1016
1017 err = -ERESTARTSYS;
1018 }
1019 sigpending = 1;
1020 }
1021
1022 if (sigpending) {
1023 spin_lock_irqsave(&current->sighand->siglock, flags);
1024 recalc_sigpending();
1025 spin_unlock_irqrestore(&current->sighand->siglock, flags);
1026 }
1027
1028 if (rc)
1029 *rc = r.rcall;
1030 else
1031 kfree(r.rcall);
1032
1033 p9_mux_free_request(m, req);
1034 if (err > 0)
1035 err = -EIO;
1036
1037 return err;
1038}
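
The sigpending bookkeeping deserves a note: pending signals are parked so the Tflush wait cannot be re-interrupted, then recomputed under the siglock afterwards so nothing is lost. The idiom in isolation (a restatement of the flow above, not new behaviour):

	/* Sketch: park pending signals around a wait that must not be
	 * re-interrupted, then let the kernel recompute TIF_SIGPENDING. */
	if (signal_pending(current)) {
		sigpending = 1;
		clear_thread_flag(TIF_SIGPENDING);
	}
	/* ... send Tflush and wait for its reply ... */
	if (sigpending) {
		spin_lock_irqsave(&current->sighand->siglock, flags);
		recalc_sigpending();
		spin_unlock_irqrestore(&current->sighand->siglock, flags);
	}
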
1039
1040#ifdef P9_NONBLOCK
1041/**
1042 * p9_conn_rpcnb - sends a 9P request without waiting for a response.
1043 * @m: mux data
1044 * @tc: request to be sent
1045 * @cb: callback function to be called when response arrives
1046 * @a: value to pass to the callback function
1047 */
1048int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc,
1049 p9_conn_req_callback cb, void *a)
1050{
1051 int err;
1052 struct p9_req *req;
1053
1054 req = p9_send_request(m, tc, cb, a);
1055 if (IS_ERR(req)) {
1056 err = PTR_ERR(req);
1057 P9_DPRINTK(P9_DEBUG_MUX, "error %d\n", err);
1058 return PTR_ERR(req);
1059 }
1060
1061 P9_DPRINTK(P9_DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag);
1062 return 0;
1063}
1064#endif /* P9_NONBLOCK */
1065
1066/**
1067 * p9_conn_cancel - cancel all pending requests with error
1068 * @m: mux data
1069 * @err: error code
1070 */
1071void p9_conn_cancel(struct p9_conn *m, int err)
1072{
1073 struct p9_req *req, *rtmp;
1074 LIST_HEAD(cancel_list);
1075
1076 P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
1077 m->err = err;
1078 spin_lock(&m->lock);
1079 list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
1080 list_move(&req->req_list, &cancel_list);
1081 }
1082 list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
1083 list_move(&req->req_list, &cancel_list);
1084 }
1085 spin_unlock(&m->lock);
1086
1087 list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
1088 list_del(&req->req_list);
1089 if (!req->err)
1090 req->err = err;
1091
1092 if (req->cb)
1093 (*req->cb) (req, req->cba);
1094 else
1095 kfree(req->rcall);
1096 }
1097
1098 wake_up(&m->equeue);
1099}
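
Note the locking discipline in the cancel path: requests are first moved to a private list under m->lock and only completed after the lock is dropped, so a callback that re-enters the mux cannot deadlock. The same bulk move could be written with list_splice_init; a minimal sketch:

	/* Sketch: equivalent bulk move — collect everything under the
	 * lock, run the callbacks lock-free afterwards. */
	LIST_HEAD(cancel_list);
	spin_lock(&m->lock);
	list_splice_init(&m->req_list, &cancel_list);
	list_splice_init(&m->unsent_req_list, &cancel_list);
	spin_unlock(&m->lock);
	/* walk cancel_list, set req->err, invoke req->cb outside the lock */
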
1100
75/** 1101/**
76 * v9fs_parse_options - parse mount options into session structure 1102 * v9fs_parse_options - parse mount options into session structure
77 * @options: options string passed from mount 1103 * @options: options string passed from mount
@@ -268,7 +1294,7 @@ end:
268} 1294}
269 1295
270/** 1296/**
271 * p9_sock_close - shutdown socket 1297 * p9_fd_close - shutdown socket
272 * @trans: private socket structure 1298 * @trans: private socket structure
273 * 1299 *
274 */ 1300 */
@@ -284,6 +1310,8 @@ static void p9_fd_close(struct p9_trans *trans)
284 if (!ts) 1310 if (!ts)
285 return; 1311 return;
286 1312
1313 p9_conn_destroy(ts->conn);
1314
287 trans->status = Disconnected; 1315 trans->status = Disconnected;
288 if (ts->rd) 1316 if (ts->rd)
289 fput(ts->rd); 1317 fput(ts->rd);
@@ -292,13 +1320,15 @@ static void p9_fd_close(struct p9_trans *trans)
292 kfree(ts); 1320 kfree(ts);
293} 1321}
294 1322
295static struct p9_trans *p9_trans_create_tcp(const char *addr, char *args) 1323static struct p9_trans *
1324p9_trans_create_tcp(const char *addr, char *args, int msize, unsigned char dotu)
296{ 1325{
297 int err; 1326 int err;
298 struct p9_trans *trans; 1327 struct p9_trans *trans;
299 struct socket *csocket; 1328 struct socket *csocket;
300 struct sockaddr_in sin_server; 1329 struct sockaddr_in sin_server;
301 struct p9_fd_opts opts; 1330 struct p9_fd_opts opts;
1331 struct p9_trans_fd *p;
302 1332
303 parse_opts(args, &opts); 1333 parse_opts(args, &opts);
304 1334
@@ -306,11 +1336,10 @@ static struct p9_trans *p9_trans_create_tcp(const char *addr, char *args)
306 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); 1336 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL);
307 if (!trans) 1337 if (!trans)
308 return ERR_PTR(-ENOMEM); 1338 return ERR_PTR(-ENOMEM);
309 1339 trans->msize = msize;
310 trans->write = p9_fd_write; 1340 trans->extended = dotu;
311 trans->read = p9_fd_read; 1341 trans->rpc = p9_fd_rpc;
312 trans->close = p9_fd_close; 1342 trans->close = p9_fd_close;
313 trans->poll = p9_fd_poll;
314 1343
315 sin_server.sin_family = AF_INET; 1344 sin_server.sin_family = AF_INET;
316 sin_server.sin_addr.s_addr = in_aton(addr); 1345 sin_server.sin_addr.s_addr = in_aton(addr);
@@ -337,6 +1366,14 @@ static struct p9_trans *p9_trans_create_tcp(const char *addr, char *args)
337 if (err < 0) 1366 if (err < 0)
338 goto error; 1367 goto error;
339 1368
1369 p = (struct p9_trans_fd *) trans->priv;
1370 p->conn = p9_conn_create(trans);
1371 if (IS_ERR(p->conn)) {
1372 err = PTR_ERR(p->conn);
1373 p->conn = NULL;
1374 goto error;
1375 }
1376
340 return trans; 1377 return trans;
341 1378
342error: 1379error:
@@ -347,22 +1384,23 @@ error:
347 return ERR_PTR(err); 1384 return ERR_PTR(err);
348} 1385}
349 1386
350static struct p9_trans *p9_trans_create_unix(const char *addr, char *args) 1387static struct p9_trans *
1388p9_trans_create_unix(const char *addr, char *args, int msize,
1389 unsigned char dotu)
351{ 1390{
352 int err; 1391 int err;
353 struct socket *csocket; 1392 struct socket *csocket;
354 struct sockaddr_un sun_server; 1393 struct sockaddr_un sun_server;
355 struct p9_trans *trans; 1394 struct p9_trans *trans;
1395 struct p9_trans_fd *p;
356 1396
357 csocket = NULL; 1397 csocket = NULL;
358 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); 1398 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL);
359 if (!trans) 1399 if (!trans)
360 return ERR_PTR(-ENOMEM); 1400 return ERR_PTR(-ENOMEM);
361 1401
362 trans->write = p9_fd_write; 1402 trans->rpc = p9_fd_rpc;
363 trans->read = p9_fd_read;
364 trans->close = p9_fd_close; 1403 trans->close = p9_fd_close;
365 trans->poll = p9_fd_poll;
366 1404
367 if (strlen(addr) > UNIX_PATH_MAX) { 1405 if (strlen(addr) > UNIX_PATH_MAX) {
368 P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n", 1406 P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n",
@@ -387,6 +1425,16 @@ static struct p9_trans *p9_trans_create_unix(const char *addr, char *args)
387 if (err < 0) 1425 if (err < 0)
388 goto error; 1426 goto error;
389 1427
1428 trans->msize = msize;
1429 trans->extended = dotu;
1430 p = (struct p9_trans_fd *) trans->priv;
1431 p->conn = p9_conn_create(trans);
1432 if (IS_ERR(p->conn)) {
1433 err = PTR_ERR(p->conn);
1434 p->conn = NULL;
1435 goto error;
1436 }
1437
390 return trans; 1438 return trans;
391 1439
392error: 1440error:
@@ -397,11 +1445,14 @@ error:
397 return ERR_PTR(err); 1445 return ERR_PTR(err);
398} 1446}
399 1447
400static struct p9_trans *p9_trans_create_fd(const char *name, char *args) 1448static struct p9_trans *
1449p9_trans_create_fd(const char *name, char *args, int msize,
1450 unsigned char extended)
401{ 1451{
402 int err; 1452 int err;
403 struct p9_trans *trans; 1453 struct p9_trans *trans;
404 struct p9_fd_opts opts; 1454 struct p9_fd_opts opts;
1455 struct p9_trans_fd *p;
405 1456
406 parse_opts(args, &opts); 1457 parse_opts(args, &opts);
407 1458
@@ -414,15 +1465,23 @@ static struct p9_trans *p9_trans_create_fd(const char *name, char *args)
414 if (!trans) 1465 if (!trans)
415 return ERR_PTR(-ENOMEM); 1466 return ERR_PTR(-ENOMEM);
416 1467
417 trans->write = p9_fd_write; 1468 trans->rpc = p9_fd_rpc;
418 trans->read = p9_fd_read;
419 trans->close = p9_fd_close; 1469 trans->close = p9_fd_close;
420 trans->poll = p9_fd_poll;
421 1470
422 err = p9_fd_open(trans, opts.rfd, opts.wfd); 1471 err = p9_fd_open(trans, opts.rfd, opts.wfd);
423 if (err < 0) 1472 if (err < 0)
424 goto error; 1473 goto error;
425 1474
1475 trans->msize = msize;
1476 trans->extended = extended;
1477 p = (struct p9_trans_fd *) trans->priv;
1478 p->conn = p9_conn_create(trans);
1479 if (IS_ERR(p->conn)) {
1480 err = PTR_ERR(p->conn);
1481 p->conn = NULL;
1482 goto error;
1483 }
1484
426 return trans; 1485 return trans;
427 1486
428error: 1487error:
@@ -453,6 +1512,12 @@ static struct p9_trans_module p9_fd_trans = {
453 1512
454static int __init p9_trans_fd_init(void) 1513static int __init p9_trans_fd_init(void)
455{ 1514{
1515 int ret = p9_mux_global_init();
1516 if (ret) {
1517 printk(KERN_WARNING "9p: starting mux failed\n");
1518 return ret;
1519 }
1520
456 v9fs_register_trans(&p9_tcp_trans); 1521 v9fs_register_trans(&p9_tcp_trans);
457 v9fs_register_trans(&p9_unix_trans); 1522 v9fs_register_trans(&p9_unix_trans);
458 v9fs_register_trans(&p9_fd_trans); 1523 v9fs_register_trans(&p9_fd_trans);
@@ -460,13 +1525,7 @@ static int __init p9_trans_fd_init(void)
460 return 1; 1525 return 1;
461} 1526}
462 1527
463static void __exit p9_trans_fd_exit(void) {
464 printk(KERN_ERR "Removal of 9p transports not implemented\n");
465 BUG();
466}
467
468module_init(p9_trans_fd_init); 1528module_init(p9_trans_fd_init);
469module_exit(p9_trans_fd_exit);
470 1529
471MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); 1530MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
472MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); 1531MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 40b71a29fc3f..0117b9fb8480 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -1,17 +1,8 @@
1/* 1/*
2 * The Guest 9p transport driver 2 * The Guest 9p transport driver
3 * 3 *
4 * This is a trivial pipe-based transport driver based on the lguest console 4 * This is a block-based transport driver based on the lguest block driver
5 * code: we use lguest's DMA mechanism to send bytes out, and register a 5 * code.
6 * DMA buffer to receive bytes in. It is assumed to be present and available
7 * from the very beginning of boot.
8 *
9 * This may be have been done by just instaniating another HVC console,
10 * but HVC's blocksize of 16 bytes is annoying and painful to performance.
11 *
12 * A more efficient transport could be built based on the virtio block driver
13 * but it requires some changes in the 9p transport model (which are in
14 * progress)
15 * 6 *
16 */ 7 */
17/* 8/*
@@ -55,11 +46,25 @@
55#include <linux/virtio.h> 46#include <linux/virtio.h>
56#include <linux/virtio_9p.h> 47#include <linux/virtio_9p.h>
57 48
49#define VIRTQUEUE_NUM 128
50
58/* a single mutex to manage channel initialization and attachment */ 51/* a single mutex to manage channel initialization and attachment */
59static DECLARE_MUTEX(virtio_9p_lock); 52static DECLARE_MUTEX(virtio_9p_lock);
60/* global which tracks highest initialized channel */ 53/* global which tracks highest initialized channel */
61static int chan_index; 54static int chan_index;
62 55
56#define P9_INIT_MAXTAG 16
57
58#define REQ_STATUS_IDLE 0
59#define REQ_STATUS_SENT 1
60#define REQ_STATUS_RCVD 2
61#define REQ_STATUS_FLSH 3
62
63struct p9_req_t {
64 int status;
65 wait_queue_head_t *wq;
66};
67
63/* We keep all per-channel information in a structure. 68/* We keep all per-channel information in a structure.
64 * This structure is allocated within the devices dev->mem space. 69 * This structure is allocated within the devices dev->mem space.
65 * A pointer to the structure will get put in the transport private. 70 * A pointer to the structure will get put in the transport private.
@@ -68,148 +73,198 @@ static struct virtio_chan {
68 bool initialized; /* channel is initialized */ 73 bool initialized; /* channel is initialized */
69 bool inuse; /* channel is in use */ 74 bool inuse; /* channel is in use */
70 75
71 struct virtqueue *in_vq, *out_vq; 76 spinlock_t lock;
77
72 struct virtio_device *vdev; 78 struct virtio_device *vdev;
79 struct virtqueue *vq;
73 80
74 /* This is our input buffer, and how much data is left in it. */ 81 struct p9_idpool *tagpool;
75 unsigned int in_len; 82 struct p9_req_t *reqs;
76 char *in, *inbuf; 83 int max_tag;
77 84
78 wait_queue_head_t wq; /* waitq for buffer */ 85 /* Scatterlist: can be too big for stack. */
86 struct scatterlist sg[VIRTQUEUE_NUM];
79} channels[MAX_9P_CHAN]; 87} channels[MAX_9P_CHAN];
80 88
89/* Lookup requests by tag */
90static struct p9_req_t *p9_lookup_tag(struct virtio_chan *c, u16 tag)
91{
92 /* This looks up the original request by tag so we know which
93 * buffer to read the data into */
94 tag++;
95
96 while (tag >= c->max_tag) {
97 int old_max = c->max_tag;
98 int count;
99
100 if (c->max_tag)
101 c->max_tag *= 2;
102 else
103 c->max_tag = P9_INIT_MAXTAG;
104
105 c->reqs = krealloc(c->reqs, sizeof(struct p9_req_t)*c->max_tag,
106 GFP_ATOMIC);
107 if (!c->reqs) {
108 printk(KERN_ERR "Couldn't grow tag array\n");
109 BUG();
110 }
111 for (count = old_max; count < c->max_tag; count++) {
112 c->reqs[count].status = REQ_STATUS_IDLE;
113 c->reqs[count].wq = kmalloc(sizeof(wait_queue_t),
114 GFP_ATOMIC);
115 if (!c->reqs[count].wq) {
116 printk(KERN_ERR "Couldn't grow tag array\n");
117 BUG();
118 }
119 init_waitqueue_head(c->reqs[count].wq);
120 }
121 }
122
123 return &c->reqs[tag];
124}
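
p9_lookup_tag grows the request array by doubling, so repeated allocation is amortized O(1); krealloc preserves the live entries and only the new tail needs initializing. Stripped to its core (hypothetical names, error handling elided):

	/* Sketch of the doubling growth used above. */
	if (tag >= max) {
		int old = max;
		max = max ? max * 2 : 16;	/* 16 = initial capacity */
		reqs = krealloc(reqs, max * sizeof(*reqs), GFP_ATOMIC);
		for (; old < max; old++)
			reqs[old].status = REQ_STATUS_IDLE;
	}
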
125
126
81/* How many bytes left in this page. */ 127/* How many bytes left in this page. */
82static unsigned int rest_of_page(void *data) 128static unsigned int rest_of_page(void *data)
83{ 129{
84 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); 130 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
85} 131}
86 132
87static int p9_virtio_write(struct p9_trans *trans, void *buf, int count) 133static void p9_virtio_close(struct p9_trans *trans)
88{ 134{
89 struct virtio_chan *chan = (struct virtio_chan *) trans->priv; 135 struct virtio_chan *chan = trans->priv;
90 struct virtqueue *out_vq = chan->out_vq; 136 int count;
91 struct scatterlist sg[1]; 137 unsigned int flags;
92 unsigned int len;
93 138
94 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio write (%d)\n", count); 139 spin_lock_irqsave(&chan->lock, flags);
140 p9_idpool_destroy(chan->tagpool);
141 for (count = 0; count < chan->max_tag; count++)
142 kfree(chan->reqs[count].wq);
143 kfree(chan->reqs);
144 chan->max_tag = 0;
145 spin_unlock_irqrestore(&chan->lock, flags);
95 146
96 /* keep it simple - make sure we don't overflow a page */ 147 down(&virtio_9p_lock);
97 if (rest_of_page(buf) < count) 148 chan->inuse = false;
98 count = rest_of_page(buf); 149 up(&virtio_9p_lock);
99 150
100 sg_init_one(sg, buf, count); 151 kfree(trans);
152}
101 153
102 /* add_buf wants a token to identify this buffer: we hand it any 154static void req_done(struct virtqueue *vq)
103 * non-NULL pointer, since there's only ever one buffer. */ 155{
104 if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { 156 struct virtio_chan *chan = vq->vdev->priv;
105 /* Tell Host to go! */ 157 struct p9_fcall *rc;
106 out_vq->vq_ops->kick(out_vq); 158 unsigned int len;
107 /* Chill out until it's done with the buffer. */ 159 unsigned long flags;
108 while (!out_vq->vq_ops->get_buf(out_vq, &len)) 160 struct p9_req_t *req;
109 cpu_relax(); 161
162 spin_lock_irqsave(&chan->lock, flags);
163 while ((rc = chan->vq->vq_ops->get_buf(chan->vq, &len)) != NULL) {
164 req = p9_lookup_tag(chan, rc->tag);
165 req->status = REQ_STATUS_RCVD;
166 wake_up(req->wq);
110 } 167 }
111 168 /* In case queue is stopped waiting for more buffers. */
112 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio wrote (%d)\n", count); 169 spin_unlock_irqrestore(&chan->lock, flags);
113
114 /* We're expected to return the amount of data we wrote: all of it. */
115 return count;
116} 170}
117 171
118/* Create a scatter-gather list representing our input buffer and put it in the 172static int
119 * queue. */ 173pack_sg_list(struct scatterlist *sg, int start, int limit, char *data,
120static void add_inbuf(struct virtio_chan *chan) 174 int count)
121{ 175{
122 struct scatterlist sg[1]; 176 int s;
123 177 int index = start;
124 sg_init_one(sg, chan->inbuf, PAGE_SIZE); 178
179 while (count) {
180 s = rest_of_page(data);
181 if (s > count)
182 s = count;
183 sg_set_buf(&sg[index++], data, s);
184 count -= s;
185 data += s;
186 if (index > limit)
187 BUG();
188 }
125 189
126 /* We should always be able to add one buffer to an empty queue. */ 190 return index-start;
127 if (chan->in_vq->vq_ops->add_buf(chan->in_vq, sg, 0, 1, chan->inbuf))
128 BUG();
129 chan->in_vq->vq_ops->kick(chan->in_vq);
130} 191}
131 192
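
pack_sg_list() exists because a scatterlist entry must not span a page boundary, so a linear buffer is chopped into rest_of_page()-sized pieces, one sg entry each. The core loop as a sketch:

	/* Sketch: page-bounded segmentation of a linear buffer. */
	while (count) {
		int s = min_t(int, rest_of_page(data), count);
		sg_set_buf(&sg[index++], data, s);	/* never crosses a page */
		data += s;
		count -= s;
	}
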
132static int p9_virtio_read(struct p9_trans *trans, void *buf, int count) 193static int
194p9_virtio_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc)
133{ 195{
134 struct virtio_chan *chan = (struct virtio_chan *) trans->priv; 196 int in, out;
135 struct virtqueue *in_vq = chan->in_vq; 197 int n, err, size;
136 198 struct virtio_chan *chan = t->priv;
137 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio read (%d)\n", count); 199 char *rdata;
200 struct p9_req_t *req;
201 unsigned long flags;
202
203 if (*rc == NULL) {
204 *rc = kmalloc(sizeof(struct p9_fcall) + t->msize, GFP_KERNEL);
205 if (!*rc)
206 return -ENOMEM;
207 }
138 208
139 /* If we don't have an input queue yet, we can't get input. */ 209 rdata = (char *)*rc+sizeof(struct p9_fcall);
140 BUG_ON(!in_vq);
141 210
142 /* No buffer? Try to get one. */ 211 n = P9_NOTAG;
143 if (!chan->in_len) { 212 if (tc->id != P9_TVERSION) {
144 chan->in = in_vq->vq_ops->get_buf(in_vq, &chan->in_len); 213 n = p9_idpool_get(chan->tagpool);
145 if (!chan->in) 214 if (n < 0)
146 return 0; 215 return -ENOMEM;
147 } 216 }
148 217
149 /* You want more than we have to give? Well, try wanting less! */ 218 spin_lock_irqsave(&chan->lock, flags);
150 if (chan->in_len < count) 219 req = p9_lookup_tag(chan, n);
151 count = chan->in_len; 220 spin_unlock_irqrestore(&chan->lock, flags);
152 221
153 /* Copy across to their buffer and increment offset. */ 222 p9_set_tag(tc, n);
154 memcpy(buf, chan->in, count);
155 chan->in += count;
156 chan->in_len -= count;
157 223
158 /* Finished? Re-register buffer so Host will use it again. */ 224 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio rpc tag %d\n", n);
159 if (chan->in_len == 0)
160 add_inbuf(chan);
161 225
162 P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio finished read (%d)\n", 226 out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, tc->sdata, tc->size);
163 count); 227 in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM-out, rdata, t->msize);
164
165 return count;
166}
167 228
168/* The poll function is used by 9p transports to determine if there 229 req->status = REQ_STATUS_SENT;
169 * is there is activity available on a particular channel. In our case
170 * we use it to wait for a callback from the input routines.
171 */
172static unsigned int
173p9_virtio_poll(struct p9_trans *trans, struct poll_table_struct *pt)
174{
175 struct virtio_chan *chan = (struct virtio_chan *)trans->priv;
176 struct virtqueue *in_vq = chan->in_vq;
177 int ret = POLLOUT; /* we can always handle more output */
178 230
179 poll_wait(NULL, &chan->wq, pt); 231 if (chan->vq->vq_ops->add_buf(chan->vq, chan->sg, out, in, tc)) {
232 P9_DPRINTK(P9_DEBUG_TRANS,
233 "9p debug: virtio rpc add_buf returned failure");
234 return -EIO;
235 }
180 236
181 /* No buffer? Try to get one. */ 237 chan->vq->vq_ops->kick(chan->vq);
182 if (!chan->in_len)
183 chan->in = in_vq->vq_ops->get_buf(in_vq, &chan->in_len);
184 238
185 if (chan->in_len) 239 wait_event(*req->wq, req->status == REQ_STATUS_RCVD);
186 ret |= POLLIN;
187 240
188 return ret; 241 size = le32_to_cpu(*(__le32 *) rdata);
189}
190 242
191static void p9_virtio_close(struct p9_trans *trans) 243 err = p9_deserialize_fcall(rdata, size, *rc, t->extended);
192{ 244 if (err < 0) {
193 struct virtio_chan *chan = trans->priv; 245 P9_DPRINTK(P9_DEBUG_TRANS,
246 "9p debug: virtio rpc deserialize returned %d\n", err);
247 return err;
248 }
194 249
195 down(&virtio_9p_lock); 250#ifdef CONFIG_NET_9P_DEBUG
196 chan->inuse = false; 251 if ((p9_debug_level&P9_DEBUG_FCALL) == P9_DEBUG_FCALL) {
197 up(&virtio_9p_lock); 252 char buf[150];
198 253
199 kfree(trans); 254 p9_printfcall(buf, sizeof(buf), *rc, t->extended);
200} 255 printk(KERN_NOTICE ">>> %p %s\n", t, buf);
256 }
257#endif
201 258
202static bool p9_virtio_intr(struct virtqueue *q) 259 if (n != P9_NOTAG && p9_idpool_check(n, chan->tagpool))
203{ 260 p9_idpool_put(n, chan->tagpool);
204 struct virtio_chan *chan = q->vdev->priv;
205 261
206 P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq); 262 req->status = REQ_STATUS_IDLE;
207 wake_up_interruptible(&chan->wq);
208 263
209 return true; 264 return 0;
210} 265}
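
The rpc path is fully synchronous: the submitter marks the slot sent, kicks the ring, and sleeps until req_done() — the virtqueue callback — flips the status and wakes the queue. The three lines that carry the protocol, restated as a sketch:

	/* Sketch: request lifecycle, submitter side. */
	req->status = REQ_STATUS_SENT;
	chan->vq->vq_ops->kick(chan->vq);		/* notify the host */
	wait_event(*req->wq, req->status == REQ_STATUS_RCVD);
	/* req_done() set RCVD under chan->lock and woke req->wq */
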
211 266
212static int p9_virtio_probe(struct virtio_device *dev) 267static int p9_virtio_probe(struct virtio_device *vdev)
213{ 268{
214 int err; 269 int err;
215 struct virtio_chan *chan; 270 struct virtio_chan *chan;
@@ -223,44 +278,29 @@ static int p9_virtio_probe(struct virtio_device *dev)
223 if (chan_index > MAX_9P_CHAN) { 278 if (chan_index > MAX_9P_CHAN) {
224 printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n"); 279 printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n");
225 BUG(); 280 BUG();
226 }
227
228 chan->vdev = dev;
229
230 /* This is the scratch page we use to receive console input */
231 chan->inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
232 if (!chan->inbuf) {
233 err = -ENOMEM; 281 err = -ENOMEM;
234 goto fail; 282 goto fail;
235 } 283 }
236 284
237 /* Find the input queue. */ 285 chan->vdev = vdev;
238 dev->priv = chan;
239 chan->in_vq = dev->config->find_vq(dev, p9_virtio_intr);
240 if (IS_ERR(chan->in_vq)) {
241 err = PTR_ERR(chan->in_vq);
242 goto free;
243 }
244 286
245 chan->out_vq = dev->config->find_vq(dev, NULL); 287 /* We expect one virtqueue, for requests. */
246 if (IS_ERR(chan->out_vq)) { 288 chan->vq = vdev->config->find_vq(vdev, 0, req_done);
247 err = PTR_ERR(chan->out_vq); 289 if (IS_ERR(chan->vq)) {
248 goto free_in_vq; 290 err = PTR_ERR(chan->vq);
291 goto out_free_vq;
249 } 292 }
293 chan->vq->vdev->priv = chan;
294 spin_lock_init(&chan->lock);
250 295
251 init_waitqueue_head(&chan->wq); 296 sg_init_table(chan->sg, VIRTQUEUE_NUM);
252 297
253 /* Register the input buffer the first time. */
254 add_inbuf(chan);
255 chan->inuse = false; 298 chan->inuse = false;
256 chan->initialized = true; 299 chan->initialized = true;
257
258 return 0; 300 return 0;
259 301
260free_in_vq: 302out_free_vq:
261 dev->config->del_vq(chan->in_vq); 303 vdev->config->del_vq(chan->vq);
262free:
263 kfree(chan->inbuf);
264fail: 304fail:
265 down(&virtio_9p_lock); 305 down(&virtio_9p_lock);
266 chan_index--; 306 chan_index--;
@@ -273,11 +313,13 @@ fail:
273 * alternate channels by matching devname versus a virtio_config entry. 313 * alternate channels by matching devname versus a virtio_config entry.
274 * We use a simple reference count mechanism to ensure that only a single 314 * We use a simple reference count mechanism to ensure that only a single
275 * mount has a channel open at a time. */ 315 * mount has a channel open at a time. */
276static struct p9_trans *p9_virtio_create(const char *devname, char *args) 316static struct p9_trans *
317p9_virtio_create(const char *devname, char *args, int msize,
318 unsigned char extended)
277{ 319{
278 struct p9_trans *trans; 320 struct p9_trans *trans;
279 int index = 0;
280 struct virtio_chan *chan = channels; 321 struct virtio_chan *chan = channels;
322 int index = 0;
281 323
282 down(&virtio_9p_lock); 324 down(&virtio_9p_lock);
283 while (index < MAX_9P_CHAN) { 325 while (index < MAX_9P_CHAN) {
@@ -292,25 +334,45 @@ static struct p9_trans *p9_virtio_create(const char *devname, char *args)
292 up(&virtio_9p_lock); 334 up(&virtio_9p_lock);
293 335
294 if (index >= MAX_9P_CHAN) { 336 if (index >= MAX_9P_CHAN) {
295 printk(KERN_ERR "9p: virtio: couldn't find a free channel\n"); 337 printk(KERN_ERR "9p: no channels available\n");
296 return NULL; 338 return ERR_PTR(-ENODEV);
297 } 339 }
298 340
341 chan->tagpool = p9_idpool_create();
342 if (IS_ERR(chan->tagpool)) {
343 printk(KERN_ERR "9p: couldn't allocate tagpool\n");
344 return ERR_PTR(-ENOMEM);
345 }
346 p9_idpool_get(chan->tagpool); /* reserve tag 0 */
347 chan->max_tag = 0;
348 chan->reqs = NULL;
349
299 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); 350 trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL);
300 if (!trans) { 351 if (!trans) {
301 printk(KERN_ERR "9p: couldn't allocate transport\n"); 352 printk(KERN_ERR "9p: couldn't allocate transport\n");
302 return ERR_PTR(-ENOMEM); 353 return ERR_PTR(-ENOMEM);
303 } 354 }
304 355 trans->extended = extended;
305 trans->write = p9_virtio_write; 356 trans->msize = msize;
306 trans->read = p9_virtio_read;
307 trans->close = p9_virtio_close; 357 trans->close = p9_virtio_close;
308 trans->poll = p9_virtio_poll; 358 trans->rpc = p9_virtio_rpc;
309 trans->priv = chan; 359 trans->priv = chan;
310 360
311 return trans; 361 return trans;
312} 362}
313 363
364static void p9_virtio_remove(struct virtio_device *vdev)
365{
366 struct virtio_chan *chan = vdev->priv;
367
368 BUG_ON(chan->inuse);
369
370 if (chan->initialized) {
371 vdev->config->del_vq(chan->vq);
372 chan->initialized = false;
373 }
374}
375
314#define VIRTIO_ID_9P 9 376#define VIRTIO_ID_9P 9
315 377
316static struct virtio_device_id id_table[] = { 378static struct virtio_device_id id_table[] = {
@@ -324,12 +386,13 @@ static struct virtio_driver p9_virtio_drv = {
324 .driver.owner = THIS_MODULE, 386 .driver.owner = THIS_MODULE,
325 .id_table = id_table, 387 .id_table = id_table,
326 .probe = p9_virtio_probe, 388 .probe = p9_virtio_probe,
389 .remove = p9_virtio_remove,
327}; 390};
328 391
329static struct p9_trans_module p9_virtio_trans = { 392static struct p9_trans_module p9_virtio_trans = {
330 .name = "virtio", 393 .name = "virtio",
331 .create = p9_virtio_create, 394 .create = p9_virtio_create,
332 .maxsize = PAGE_SIZE, 395 .maxsize = PAGE_SIZE*16,
333 .def = 0, 396 .def = 0,
334}; 397};
335 398
@@ -345,7 +408,13 @@ static int __init p9_virtio_init(void)
345 return register_virtio_driver(&p9_virtio_drv); 408 return register_virtio_driver(&p9_virtio_drv);
346} 409}
347 410
411static void __exit p9_virtio_cleanup(void)
412{
413 unregister_virtio_driver(&p9_virtio_drv);
414}
415
348module_init(p9_virtio_init); 416module_init(p9_virtio_init);
417module_exit(p9_virtio_cleanup);
349 418
350MODULE_DEVICE_TABLE(virtio, id_table); 419MODULE_DEVICE_TABLE(virtio, id_table);
351MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); 420MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
diff --git a/net/9p/util.c b/net/9p/util.c
index 22077b79395d..ef7215565d88 100644
--- a/net/9p/util.c
+++ b/net/9p/util.c
@@ -33,7 +33,7 @@
33#include <net/9p/9p.h> 33#include <net/9p/9p.h>
34 34
35struct p9_idpool { 35struct p9_idpool {
36 struct semaphore lock; 36 spinlock_t lock;
37 struct idr pool; 37 struct idr pool;
38}; 38};
39 39
@@ -45,7 +45,7 @@ struct p9_idpool *p9_idpool_create(void)
45 if (!p) 45 if (!p)
46 return ERR_PTR(-ENOMEM); 46 return ERR_PTR(-ENOMEM);
47 47
48 init_MUTEX(&p->lock); 48 spin_lock_init(&p->lock);
49 idr_init(&p->pool); 49 idr_init(&p->pool);
50 50
51 return p; 51 return p;
@@ -71,19 +71,17 @@ int p9_idpool_get(struct p9_idpool *p)
71{ 71{
72 int i = 0; 72 int i = 0;
73 int error; 73 int error;
74 unsigned int flags;
74 75
75retry: 76retry:
76 if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) 77 if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
77 return 0; 78 return 0;
78 79
79 if (down_interruptible(&p->lock) == -EINTR) { 80 spin_lock_irqsave(&p->lock, flags);
80 P9_EPRINTK(KERN_WARNING, "Interrupted while locking\n");
81 return -1;
82 }
83 81
84 /* no need to store exactly p, we just need something non-null */ 82 /* no need to store exactly p, we just need something non-null */
85 error = idr_get_new(&p->pool, p, &i); 83 error = idr_get_new(&p->pool, p, &i);
86 up(&p->lock); 84 spin_unlock_irqrestore(&p->lock, flags);
87 85
88 if (error == -EAGAIN) 86 if (error == -EAGAIN)
89 goto retry; 87 goto retry;
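
The switch to a spinlock is safe because idr_pre_get() does its GFP_KERNEL allocation outside the lock and idr_get_new() only consumes preallocated nodes; -EAGAIN means a racing caller stole the preload, so the loop simply retries. The classic pre-ida pattern, as used above:

	/* Sketch: preload outside the lock, allocate under it, retry on
	 * -EAGAIN (keeping this file's existing return-0-on-OOM
	 * convention). */
retry:
	if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
		return 0;			/* out of memory */
	spin_lock_irqsave(&p->lock, flags);
	error = idr_get_new(&p->pool, p, &i);
	spin_unlock_irqrestore(&p->lock, flags);
	if (error == -EAGAIN)
		goto retry;
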
@@ -104,12 +102,10 @@ EXPORT_SYMBOL(p9_idpool_get);
104 102
105void p9_idpool_put(int id, struct p9_idpool *p) 103void p9_idpool_put(int id, struct p9_idpool *p)
106{ 104{
107 if (down_interruptible(&p->lock) == -EINTR) { 105 unsigned int flags;
108 P9_EPRINTK(KERN_WARNING, "Interrupted while locking\n"); 106 spin_lock_irqsave(&p->lock, flags);
109 return;
110 }
111 idr_remove(&p->pool, id); 107 idr_remove(&p->pool, id);
112 up(&p->lock); 108 spin_unlock_irqrestore(&p->lock, flags);
113} 109}
114EXPORT_SYMBOL(p9_idpool_put); 110EXPORT_SYMBOL(p9_idpool_put);
115 111
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 782a22602b86..519cdb920f93 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -135,8 +135,8 @@ static void __hidp_copy_session(struct hidp_session *session, struct hidp_connin
135 } 135 }
136} 136}
137 137
138static inline int hidp_queue_event(struct hidp_session *session, struct input_dev *dev, 138static int hidp_queue_event(struct hidp_session *session, struct input_dev *dev,
139 unsigned int type, unsigned int code, int value) 139 unsigned int type, unsigned int code, int value)
140{ 140{
141 unsigned char newleds; 141 unsigned char newleds;
142 struct sk_buff *skb; 142 struct sk_buff *skb;
@@ -243,7 +243,8 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
243 input_sync(dev); 243 input_sync(dev);
244} 244}
245 245
246static inline int hidp_queue_report(struct hidp_session *session, unsigned char *data, int size) 246static int hidp_queue_report(struct hidp_session *session,
247 unsigned char *data, int size)
247{ 248{
248 struct sk_buff *skb; 249 struct sk_buff *skb;
249 250
@@ -287,7 +288,7 @@ static void hidp_idle_timeout(unsigned long arg)
287 hidp_schedule(session); 288 hidp_schedule(session);
288} 289}
289 290
290static inline void hidp_set_timer(struct hidp_session *session) 291static void hidp_set_timer(struct hidp_session *session)
291{ 292{
292 if (session->idle_to > 0) 293 if (session->idle_to > 0)
293 mod_timer(&session->timer, jiffies + HZ * session->idle_to); 294 mod_timer(&session->timer, jiffies + HZ * session->idle_to);
@@ -332,7 +333,8 @@ static inline int hidp_send_ctrl_message(struct hidp_session *session,
332 return err; 333 return err;
333} 334}
334 335
335static inline void hidp_process_handshake(struct hidp_session *session, unsigned char param) 336static void hidp_process_handshake(struct hidp_session *session,
337 unsigned char param)
336{ 338{
337 BT_DBG("session %p param 0x%02x", session, param); 339 BT_DBG("session %p param 0x%02x", session, param);
338 340
@@ -365,38 +367,23 @@ static inline void hidp_process_handshake(struct hidp_session *session, unsigned
365 } 367 }
366} 368}
367 369
368static inline void hidp_process_hid_control(struct hidp_session *session, unsigned char param) 370static void hidp_process_hid_control(struct hidp_session *session,
371 unsigned char param)
369{ 372{
370 BT_DBG("session %p param 0x%02x", session, param); 373 BT_DBG("session %p param 0x%02x", session, param);
371 374
372 switch (param) { 375 if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) {
373 case HIDP_CTRL_NOP:
374 break;
375
376 case HIDP_CTRL_VIRTUAL_CABLE_UNPLUG:
377 /* Flush the transmit queues */ 376 /* Flush the transmit queues */
378 skb_queue_purge(&session->ctrl_transmit); 377 skb_queue_purge(&session->ctrl_transmit);
379 skb_queue_purge(&session->intr_transmit); 378 skb_queue_purge(&session->intr_transmit);
380 379
381 /* Kill session thread */ 380 /* Kill session thread */
382 atomic_inc(&session->terminate); 381 atomic_inc(&session->terminate);
383 break;
384
385 case HIDP_CTRL_HARD_RESET:
386 case HIDP_CTRL_SOFT_RESET:
387 case HIDP_CTRL_SUSPEND:
388 case HIDP_CTRL_EXIT_SUSPEND:
389 /* FIXME: We have to parse these and return no error */
390 break;
391
392 default:
393 __hidp_send_ctrl_message(session,
394 HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
395 break;
396 } 382 }
397} 383}
398 384
399static inline void hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param) 385static void hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
386 unsigned char param)
400{ 387{
401 BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param); 388 BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
402 389
@@ -423,7 +410,8 @@ static inline void hidp_process_data(struct hidp_session *session, struct sk_buf
423 } 410 }
424} 411}
425 412
426static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_buff *skb) 413static void hidp_recv_ctrl_frame(struct hidp_session *session,
414 struct sk_buff *skb)
427{ 415{
428 unsigned char hdr, type, param; 416 unsigned char hdr, type, param;
429 417
@@ -457,7 +445,8 @@ static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_
457 kfree_skb(skb); 445 kfree_skb(skb);
458} 446}
459 447
460static inline void hidp_recv_intr_frame(struct hidp_session *session, struct sk_buff *skb) 448static void hidp_recv_intr_frame(struct hidp_session *session,
449 struct sk_buff *skb)
461{ 450{
462 unsigned char hdr; 451 unsigned char hdr;
463 452
@@ -625,7 +614,8 @@ static struct device *hidp_get_device(struct hidp_session *session)
625 return conn ? &conn->dev : NULL; 614 return conn ? &conn->dev : NULL;
626} 615}
627 616
628static inline int hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req) 617static int hidp_setup_input(struct hidp_session *session,
618 struct hidp_connadd_req *req)
629{ 619{
630 struct input_dev *input = session->input; 620 struct input_dev *input = session->input;
631 int i; 621 int i;
@@ -702,7 +692,8 @@ static void hidp_setup_quirks(struct hid_device *hid)
702 hid->quirks = hidp_blacklist[n].quirks; 692 hid->quirks = hidp_blacklist[n].quirks;
703} 693}
704 694
705static inline void hidp_setup_hid(struct hidp_session *session, struct hidp_connadd_req *req) 695static void hidp_setup_hid(struct hidp_session *session,
696 struct hidp_connadd_req *req)
706{ 697{
707 struct hid_device *hid = session->hid; 698 struct hid_device *hid = session->hid;
708 struct hid_report *report; 699 struct hid_report *report;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 788c70321858..e4c779bb8d76 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -429,7 +429,8 @@ static int rfcomm_release_dev(void __user *arg)
429 if (dev->tty) 429 if (dev->tty)
430 tty_vhangup(dev->tty); 430 tty_vhangup(dev->tty);
431 431
432 rfcomm_dev_del(dev); 432 if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
433 rfcomm_dev_del(dev);
433 rfcomm_dev_put(dev); 434 rfcomm_dev_put(dev);
434 return 0; 435 return 0;
435} 436}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 26e941d912e8..7b660834a4c2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
287 * @ops: pernet operations structure to manipulate 287 * @ops: pernet operations structure to manipulate
288 * 288 *
289 * Remove the pernet operations structure from the list to be 289 * Remove the pernet operations structure from the list to be
290 * used when network namespaces are created or destoryed. In 290 * used when network namespaces are created or destroyed. In
291 * addition run the exit method for all existing network 291 * addition run the exit method for all existing network
292 * namespaces. 292 * namespaces.
293 */ 293 */
@@ -335,7 +335,7 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
335 * @ops: pernet operations structure to manipulate 335 * @ops: pernet operations structure to manipulate
336 * 336 *
337 * Remove the pernet operations structure from the list to be 337 * Remove the pernet operations structure from the list to be
338 * used when network namespaces are created or destoryed. In 338 * used when network namespaces are created or destroyed. In
339 * addition run the exit method for all existing network 339 * addition run the exit method for all existing network
340 * namespaces. 340 * namespaces.
341 */ 341 */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ddbdde82a700..61ac8d06292c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -82,32 +82,6 @@ int rtnl_trylock(void)
82 return mutex_trylock(&rtnl_mutex); 82 return mutex_trylock(&rtnl_mutex);
83} 83}
84 84
85int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
86{
87 memset(tb, 0, sizeof(struct rtattr*)*maxattr);
88
89 while (RTA_OK(rta, len)) {
90 unsigned flavor = rta->rta_type;
91 if (flavor && flavor <= maxattr)
92 tb[flavor-1] = rta;
93 rta = RTA_NEXT(rta, len);
94 }
95 return 0;
96}
97
98int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
99 struct rtattr *rta, int len)
100{
101 if (RTA_PAYLOAD(rta) < len)
102 return -1;
103 if (RTA_PAYLOAD(rta) >= RTA_ALIGN(len) + sizeof(struct rtattr)) {
104 rta = RTA_DATA(rta) + RTA_ALIGN(len);
105 return rtattr_parse_nested(tb, maxattr, rta);
106 }
107 memset(tb, 0, sizeof(struct rtattr *) * maxattr);
108 return 0;
109}
110
111static struct rtnl_link *rtnl_msg_handlers[NPROTO]; 85static struct rtnl_link *rtnl_msg_handlers[NPROTO];
112 86
113static inline int rtm_msgindex(int msgtype) 87static inline int rtm_msgindex(int msgtype)
@@ -442,21 +416,6 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
442 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); 416 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
443} 417}
444 418
445size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
446{
447 size_t ret = RTA_PAYLOAD(rta);
448 char *src = RTA_DATA(rta);
449
450 if (ret > 0 && src[ret - 1] == '\0')
451 ret--;
452 if (size > 0) {
453 size_t len = (ret >= size) ? size - 1 : ret;
454 memset(dest, 0, size);
455 memcpy(dest, src, len);
456 }
457 return ret;
458}
459
460int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) 419int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
461{ 420{
462 struct sock *rtnl = net->rtnl; 421 struct sock *rtnl = net->rtnl;
@@ -1411,9 +1370,6 @@ void __init rtnetlink_init(void)
1411} 1370}
1412 1371
1413EXPORT_SYMBOL(__rta_fill); 1372EXPORT_SYMBOL(__rta_fill);
1414EXPORT_SYMBOL(rtattr_strlcpy);
1415EXPORT_SYMBOL(rtattr_parse);
1416EXPORT_SYMBOL(__rtattr_parse_nested_compat);
1417EXPORT_SYMBOL(rtnetlink_put_metrics); 1373EXPORT_SYMBOL(rtnetlink_put_metrics);
1418EXPORT_SYMBOL(rtnl_lock); 1374EXPORT_SYMBOL(rtnl_lock);
1419EXPORT_SYMBOL(rtnl_trylock); 1375EXPORT_SYMBOL(rtnl_trylock);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 98420f9c4b6d..4e354221ec23 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2461,6 +2461,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
2461 return elt; 2461 return elt;
2462} 2462}
2463 2463
2464/**
2465 * skb_partial_csum_set - set up and verify partial csum values for packet
2466 * @skb: the skb to set
2467 * @start: the number of bytes after skb->data to start checksumming.
2468 * @off: the offset from start to place the checksum.
2469 *
2470 * For untrusted partially-checksummed packets, we need to make sure the values
2471 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
2472 *
2473 * This function checks and sets those values and skb->ip_summed: if this
2474 * returns false you should drop the packet.
2475 */
2476bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
2477{
2478 if (unlikely(start > skb->len - 2) ||
2479 unlikely((int)start + off > skb->len - 2)) {
2480 if (net_ratelimit())
2481 printk(KERN_WARNING
2482 "bad partial csum: csum=%u/%u len=%u\n",
2483 start, off, skb->len);
2484 return false;
2485 }
2486 skb->ip_summed = CHECKSUM_PARTIAL;
2487 skb->csum_start = skb_headroom(skb) + start;
2488 skb->csum_offset = off;
2489 return true;
2490}
2491
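
A caller is expected to drop the packet when the helper returns false. A hypothetical receive path (the hdr structure and its fields are illustrative assumptions, not part of this patch's API):

	/* Sketch: validate untrusted csum offsets before accepting a
	 * CHECKSUM_PARTIAL packet from a guest or device. */
	if (!skb_partial_csum_set(skb, hdr->csum_start, hdr->csum_offset)) {
		kfree_skb(skb);		/* bogus offsets: drop, don't oops */
		return -EINVAL;
	}
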
2464EXPORT_SYMBOL(___pskb_trim); 2492EXPORT_SYMBOL(___pskb_trim);
2465EXPORT_SYMBOL(__kfree_skb); 2493EXPORT_SYMBOL(__kfree_skb);
2466EXPORT_SYMBOL(kfree_skb); 2494EXPORT_SYMBOL(kfree_skb);
@@ -2497,3 +2525,4 @@ EXPORT_SYMBOL(skb_append_datato_frags);
2497 2525
2498EXPORT_SYMBOL_GPL(skb_to_sgvec); 2526EXPORT_SYMBOL_GPL(skb_to_sgvec);
2499EXPORT_SYMBOL_GPL(skb_cow_data); 2527EXPORT_SYMBOL_GPL(skb_cow_data);
2528EXPORT_SYMBOL_GPL(skb_partial_csum_set);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index ebe59d98721a..287a62bc2e0f 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -271,8 +271,6 @@ extern struct sk_buff *dccp_make_response(struct sock *sk,
271 271
272extern int dccp_connect(struct sock *sk); 272extern int dccp_connect(struct sock *sk);
273extern int dccp_disconnect(struct sock *sk, int flags); 273extern int dccp_disconnect(struct sock *sk, int flags);
274extern void dccp_hash(struct sock *sk);
275extern void dccp_unhash(struct sock *sk);
276extern int dccp_getsockopt(struct sock *sk, int level, int optname, 274extern int dccp_getsockopt(struct sock *sk, int level, int optname,
277 char __user *optval, int __user *optlen); 275 char __user *optval, int __user *optlen);
278extern int dccp_setsockopt(struct sock *sk, int level, int optname, 276extern int dccp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index c982ad88223d..474075adbde4 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -38,12 +38,6 @@
38 */ 38 */
39static struct socket *dccp_v4_ctl_socket; 39static struct socket *dccp_v4_ctl_socket;
40 40
41static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
42{
43 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
44 inet_csk_bind_conflict);
45}
46
47int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 41int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
48{ 42{
49 struct inet_sock *inet = inet_sk(sk); 43 struct inet_sock *inet = inet_sk(sk);
@@ -408,8 +402,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
408 402
409 dccp_sync_mss(newsk, dst_mtu(dst)); 403 dccp_sync_mss(newsk, dst_mtu(dst));
410 404
411 __inet_hash_nolisten(&dccp_hashinfo, newsk); 405 __inet_hash_nolisten(newsk);
412 __inet_inherit_port(&dccp_hashinfo, sk, newsk); 406 __inet_inherit_port(sk, newsk);
413 407
414 return newsk; 408 return newsk;
415 409
@@ -898,6 +892,7 @@ static struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
898 .getsockopt = ip_getsockopt, 892 .getsockopt = ip_getsockopt,
899 .addr2sockaddr = inet_csk_addr2sockaddr, 893 .addr2sockaddr = inet_csk_addr2sockaddr,
900 .sockaddr_len = sizeof(struct sockaddr_in), 894 .sockaddr_len = sizeof(struct sockaddr_in),
895 .bind_conflict = inet_csk_bind_conflict,
901#ifdef CONFIG_COMPAT 896#ifdef CONFIG_COMPAT
902 .compat_setsockopt = compat_ip_setsockopt, 897 .compat_setsockopt = compat_ip_setsockopt,
903 .compat_getsockopt = compat_ip_getsockopt, 898 .compat_getsockopt = compat_ip_getsockopt,
@@ -937,10 +932,10 @@ static struct proto dccp_v4_prot = {
937 .sendmsg = dccp_sendmsg, 932 .sendmsg = dccp_sendmsg,
938 .recvmsg = dccp_recvmsg, 933 .recvmsg = dccp_recvmsg,
939 .backlog_rcv = dccp_v4_do_rcv, 934 .backlog_rcv = dccp_v4_do_rcv,
940 .hash = dccp_hash, 935 .hash = inet_hash,
941 .unhash = dccp_unhash, 936 .unhash = inet_unhash,
942 .accept = inet_csk_accept, 937 .accept = inet_csk_accept,
943 .get_port = dccp_v4_get_port, 938 .get_port = inet_csk_get_port,
944 .shutdown = dccp_shutdown, 939 .shutdown = dccp_shutdown,
945 .destroy = dccp_destroy_sock, 940 .destroy = dccp_destroy_sock,
946 .orphan_count = &dccp_orphan_count, 941 .orphan_count = &dccp_orphan_count,
@@ -948,6 +943,7 @@ static struct proto dccp_v4_prot = {
948 .obj_size = sizeof(struct dccp_sock), 943 .obj_size = sizeof(struct dccp_sock),
949 .rsk_prot = &dccp_request_sock_ops, 944 .rsk_prot = &dccp_request_sock_ops,
950 .twsk_prot = &dccp_timewait_sock_ops, 945 .twsk_prot = &dccp_timewait_sock_ops,
946 .hashinfo = &dccp_hashinfo,
951#ifdef CONFIG_COMPAT 947#ifdef CONFIG_COMPAT
952 .compat_setsockopt = compat_dccp_setsockopt, 948 .compat_setsockopt = compat_dccp_setsockopt,
953 .compat_getsockopt = compat_dccp_getsockopt, 949 .compat_getsockopt = compat_dccp_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ed0a0053a797..490333d47c7b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -39,21 +39,15 @@ static struct socket *dccp_v6_ctl_socket;
39static struct inet_connection_sock_af_ops dccp_ipv6_mapped; 39static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
40static struct inet_connection_sock_af_ops dccp_ipv6_af_ops; 40static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
41 41
42static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
43{
44 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
45 inet6_csk_bind_conflict);
46}
47
48static void dccp_v6_hash(struct sock *sk) 42static void dccp_v6_hash(struct sock *sk)
49{ 43{
50 if (sk->sk_state != DCCP_CLOSED) { 44 if (sk->sk_state != DCCP_CLOSED) {
51 if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { 45 if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
52 dccp_hash(sk); 46 inet_hash(sk);
53 return; 47 return;
54 } 48 }
55 local_bh_disable(); 49 local_bh_disable();
56 __inet6_hash(&dccp_hashinfo, sk); 50 __inet6_hash(sk);
57 local_bh_enable(); 51 local_bh_enable();
58 } 52 }
59} 53}
@@ -630,8 +624,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
630 624
631 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; 625 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
632 626
633 __inet6_hash(&dccp_hashinfo, newsk); 627 __inet6_hash(newsk);
634 inet_inherit_port(&dccp_hashinfo, sk, newsk); 628 inet_inherit_port(sk, newsk);
635 629
636 return newsk; 630 return newsk;
637 631
@@ -1054,6 +1048,7 @@ static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
1054 .getsockopt = ipv6_getsockopt, 1048 .getsockopt = ipv6_getsockopt,
1055 .addr2sockaddr = inet6_csk_addr2sockaddr, 1049 .addr2sockaddr = inet6_csk_addr2sockaddr,
1056 .sockaddr_len = sizeof(struct sockaddr_in6), 1050 .sockaddr_len = sizeof(struct sockaddr_in6),
1051 .bind_conflict = inet6_csk_bind_conflict,
1057#ifdef CONFIG_COMPAT 1052#ifdef CONFIG_COMPAT
1058 .compat_setsockopt = compat_ipv6_setsockopt, 1053 .compat_setsockopt = compat_ipv6_setsockopt,
1059 .compat_getsockopt = compat_ipv6_getsockopt, 1054 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1123,9 +1118,9 @@ static struct proto dccp_v6_prot = {
1123 .recvmsg = dccp_recvmsg, 1118 .recvmsg = dccp_recvmsg,
1124 .backlog_rcv = dccp_v6_do_rcv, 1119 .backlog_rcv = dccp_v6_do_rcv,
1125 .hash = dccp_v6_hash, 1120 .hash = dccp_v6_hash,
1126 .unhash = dccp_unhash, 1121 .unhash = inet_unhash,
1127 .accept = inet_csk_accept, 1122 .accept = inet_csk_accept,
1128 .get_port = dccp_v6_get_port, 1123 .get_port = inet_csk_get_port,
1129 .shutdown = dccp_shutdown, 1124 .shutdown = dccp_shutdown,
1130 .destroy = dccp_v6_destroy_sock, 1125 .destroy = dccp_v6_destroy_sock,
1131 .orphan_count = &dccp_orphan_count, 1126 .orphan_count = &dccp_orphan_count,
@@ -1133,6 +1128,7 @@ static struct proto dccp_v6_prot = {
1133 .obj_size = sizeof(struct dccp6_sock), 1128 .obj_size = sizeof(struct dccp6_sock),
1134 .rsk_prot = &dccp6_request_sock_ops, 1129 .rsk_prot = &dccp6_request_sock_ops,
1135 .twsk_prot = &dccp6_timewait_sock_ops, 1130 .twsk_prot = &dccp6_timewait_sock_ops,
1131 .hashinfo = &dccp_hashinfo,
1136#ifdef CONFIG_COMPAT 1132#ifdef CONFIG_COMPAT
1137 .compat_setsockopt = compat_dccp_setsockopt, 1133 .compat_setsockopt = compat_dccp_setsockopt,
1138 .compat_getsockopt = compat_dccp_getsockopt, 1134 .compat_getsockopt = compat_dccp_getsockopt,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0bed4a6095b7..e3f5d37b84be 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -78,7 +78,7 @@ void dccp_set_state(struct sock *sk, const int state)
78 sk->sk_prot->unhash(sk); 78 sk->sk_prot->unhash(sk);
79 if (inet_csk(sk)->icsk_bind_hash != NULL && 79 if (inet_csk(sk)->icsk_bind_hash != NULL &&
80 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) 80 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
81 inet_put_port(&dccp_hashinfo, sk); 81 inet_put_port(sk);
82 /* fall through */ 82 /* fall through */
83 default: 83 default:
84 if (oldstate == DCCP_OPEN) 84 if (oldstate == DCCP_OPEN)
@@ -173,20 +173,6 @@ const char *dccp_state_name(const int state)
173 173
174EXPORT_SYMBOL_GPL(dccp_state_name); 174EXPORT_SYMBOL_GPL(dccp_state_name);
175 175
176void dccp_hash(struct sock *sk)
177{
178 inet_hash(&dccp_hashinfo, sk);
179}
180
181EXPORT_SYMBOL_GPL(dccp_hash);
182
183void dccp_unhash(struct sock *sk)
184{
185 inet_unhash(&dccp_hashinfo, sk);
186}
187
188EXPORT_SYMBOL_GPL(dccp_unhash);
189
190int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 176int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
191{ 177{
192 struct dccp_sock *dp = dccp_sk(sk); 178 struct dccp_sock *dp = dccp_sk(sk);
@@ -268,7 +254,7 @@ int dccp_destroy_sock(struct sock *sk)
268 254
269 /* Clean up a referenced DCCP bind bucket. */ 255 /* Clean up a referenced DCCP bind bucket. */
270 if (inet_csk(sk)->icsk_bind_hash != NULL) 256 if (inet_csk(sk)->icsk_bind_hash != NULL)
271 inet_put_port(&dccp_hashinfo, sk); 257 inet_put_port(sk);
272 258
273 kfree(dp->dccps_service_list); 259 kfree(dp->dccps_service_list);
274 dp->dccps_service_list = NULL; 260 dp->dccps_service_list = NULL;
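Taken together, the DCCP hunks above (and the matching TCP hunks further down) are one refactor: struct proto grows a .hashinfo pointer, so the generic inet_hash()/inet_unhash()/inet_put_port()/inet_csk_get_port() can recover the per-protocol table from the socket itself, and the one-line dccp_hash()/dccp_unhash() wrappers become dead code. A minimal, self-contained C model of that pattern; every name below is invented for illustration and is not a kernel symbol:

#include <stdio.h>

struct hashtable { const char *name; };

struct proto_model {                    /* stands in for struct proto */
        struct hashtable *hashinfo;     /* the new .hashinfo member   */
};

struct sock_model {                     /* stands in for struct sock  */
        const struct proto_model *prot; /* i.e. sk->sk_prot           */
};

/* Generic helper: the table is derived from the socket, not passed in. */
static void generic_unhash(struct sock_model *sk)
{
        struct hashtable *h = sk->prot->hashinfo;
        printf("unhash from %s\n", h->name);
}

int main(void)
{
        static struct hashtable dccp_table = { "dccp_hashinfo" };
        static const struct proto_model dccp_prot = { .hashinfo = &dccp_table };
        struct sock_model sk = { .prot = &dccp_prot };

        generic_unhash(&sk);    /* replaces a wrapper that curried the table */
        return 0;
}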
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a2241060113b..8cd357f41283 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -547,8 +547,8 @@ int cipso_v4_doi_remove(u32 doi,
547 rcu_read_lock(); 547 rcu_read_lock();
548 list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list) 548 list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
549 if (dom_iter->valid) 549 if (dom_iter->valid)
550 netlbl_domhsh_remove(dom_iter->domain, 550 netlbl_cfg_map_del(dom_iter->domain,
551 audit_info); 551 audit_info);
552 rcu_read_unlock(); 552 rcu_read_unlock();
553 cipso_v4_cache_invalidate(); 553 cipso_v4_cache_invalidate();
554 call_rcu(&doi_def->rcu, callback); 554 call_rcu(&doi_def->rcu, callback);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 35851c96bdfb..f5fba3f71c06 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2431,8 +2431,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2431 rtn_type(buf2, sizeof(buf2), 2431 rtn_type(buf2, sizeof(buf2),
2432 fa->fa_type)); 2432 fa->fa_type));
2433 if (fa->fa_tos) 2433 if (fa->fa_tos)
2434 seq_printf(seq, "tos =%d\n", 2434 seq_printf(seq, " tos=%d", fa->fa_tos);
2435 fa->fa_tos);
2436 seq_putc(seq, '\n'); 2435 seq_putc(seq, '\n');
2437 } 2436 }
2438 } 2437 }
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a7321a82df6d..a13c074dac09 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1015,7 +1015,8 @@ int icmp_rcv(struct sk_buff *skb)
1015 goto error; 1015 goto error;
1016 } 1016 }
1017 1017
1018 __skb_pull(skb, sizeof(*icmph)); 1018 if (!pskb_pull(skb, sizeof(*icmph)))
1019 goto error;
1019 1020
1020 icmph = icmp_hdr(skb); 1021 icmph = icmp_hdr(skb);
1021 1022
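The one-line ICMP change above (and its IPv6 twin in net/ipv6/icmp.c below) swaps an unchecked pull for a checked one: __skb_pull() assumes the requested bytes already sit in the skb's linear head and will advance past its end on a paged skb, while pskb_pull() first makes sizeof(*icmph) bytes linear and returns NULL if it cannot, so the handler can drop the packet cleanly. Kernel-style fragment (not a standalone program) showing the idiom; the re-read of icmph afterwards matters because pskb_pull() may have had to move data, invalidating old pointers into the packet:

        if (!pskb_pull(skb, sizeof(*icmph)))
                goto error;        /* header not linearizable: drop */

        icmph = icmp_hdr(skb);     /* re-read after a possible move */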
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index de5a41de191a..b189278c7bc1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,11 +78,9 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
78/* Obtain a reference to a local port for the given sock, 78/* Obtain a reference to a local port for the given sock,
79 * if snum is zero it means select any available local port. 79 * if snum is zero it means select any available local port.
80 */ 80 */
81int inet_csk_get_port(struct inet_hashinfo *hashinfo, 81int inet_csk_get_port(struct sock *sk, unsigned short snum)
82 struct sock *sk, unsigned short snum,
83 int (*bind_conflict)(const struct sock *sk,
84 const struct inet_bind_bucket *tb))
85{ 82{
83 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
86 struct inet_bind_hashbucket *head; 84 struct inet_bind_hashbucket *head;
87 struct hlist_node *node; 85 struct hlist_node *node;
88 struct inet_bind_bucket *tb; 86 struct inet_bind_bucket *tb;
@@ -142,7 +140,7 @@ tb_found:
142 goto success; 140 goto success;
143 } else { 141 } else {
144 ret = 1; 142 ret = 1;
145 if (bind_conflict(sk, tb)) 143 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
146 goto fail_unlock; 144 goto fail_unlock;
147 } 145 }
148 } 146 }
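The inet_csk_get_port() change applies the same de-currying to the bind-conflict callback: instead of each protocol wrapping the function just to supply its conflict test (the deleted tcp_v4_get_port(), tcp_v6_get_port() and dccp_v6_get_port() wrappers elsewhere in this patch), the callback now lives in the address-family ops and is fetched at the point of use. The call shape before and after, as a non-compilable sketch:

        /* before: table and callback curried in by a per-protocol wrapper */
        inet_csk_get_port(&tcp_hashinfo, sk, snum, inet_csk_bind_conflict);

        /* after: both are reached through the socket */
        inet_csk_get_port(sk, snum);
        /* ...which internally resolves the test as: */
        inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb);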
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 48d45008f749..9cac6c034abd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -66,8 +66,9 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
66/* 66/*
67 * Get rid of any references to a local port held by the given sock. 67 * Get rid of any references to a local port held by the given sock.
68 */ 68 */
69static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) 69static void __inet_put_port(struct sock *sk)
70{ 70{
71 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
71 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); 72 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
72 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 73 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
73 struct inet_bind_bucket *tb; 74 struct inet_bind_bucket *tb;
@@ -81,10 +82,10 @@ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
81 spin_unlock(&head->lock); 82 spin_unlock(&head->lock);
82} 83}
83 84
84void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) 85void inet_put_port(struct sock *sk)
85{ 86{
86 local_bh_disable(); 87 local_bh_disable();
87 __inet_put_port(hashinfo, sk); 88 __inet_put_port(sk);
88 local_bh_enable(); 89 local_bh_enable();
89} 90}
90 91
@@ -317,8 +318,9 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
317 inet->dport); 318 inet->dport);
318} 319}
319 320
320void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk) 321void __inet_hash_nolisten(struct sock *sk)
321{ 322{
323 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
322 struct hlist_head *list; 324 struct hlist_head *list;
323 rwlock_t *lock; 325 rwlock_t *lock;
324 struct inet_ehash_bucket *head; 326 struct inet_ehash_bucket *head;
@@ -337,13 +339,14 @@ void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk)
337} 339}
338EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 340EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
339 341
340void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) 342static void __inet_hash(struct sock *sk)
341{ 343{
344 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
342 struct hlist_head *list; 345 struct hlist_head *list;
343 rwlock_t *lock; 346 rwlock_t *lock;
344 347
345 if (sk->sk_state != TCP_LISTEN) { 348 if (sk->sk_state != TCP_LISTEN) {
346 __inet_hash_nolisten(hashinfo, sk); 349 __inet_hash_nolisten(sk);
347 return; 350 return;
348 } 351 }
349 352
@@ -357,13 +360,48 @@ void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
357 write_unlock(lock); 360 write_unlock(lock);
358 wake_up(&hashinfo->lhash_wait); 361 wake_up(&hashinfo->lhash_wait);
359} 362}
360EXPORT_SYMBOL_GPL(__inet_hash); 363
364void inet_hash(struct sock *sk)
365{
366 if (sk->sk_state != TCP_CLOSE) {
367 local_bh_disable();
368 __inet_hash(sk);
369 local_bh_enable();
370 }
371}
372EXPORT_SYMBOL_GPL(inet_hash);
373
374void inet_unhash(struct sock *sk)
375{
376 rwlock_t *lock;
377 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
378
379 if (sk_unhashed(sk))
380 goto out;
381
382 if (sk->sk_state == TCP_LISTEN) {
383 local_bh_disable();
384 inet_listen_wlock(hashinfo);
385 lock = &hashinfo->lhash_lock;
386 } else {
387 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
388 write_lock_bh(lock);
389 }
390
391 if (__sk_del_node_init(sk))
392 sock_prot_inuse_add(sk->sk_prot, -1);
393 write_unlock_bh(lock);
394out:
395 if (sk->sk_state == TCP_LISTEN)
396 wake_up(&hashinfo->lhash_wait);
397}
398EXPORT_SYMBOL_GPL(inet_unhash);
361 399
362int __inet_hash_connect(struct inet_timewait_death_row *death_row, 400int __inet_hash_connect(struct inet_timewait_death_row *death_row,
363 struct sock *sk, 401 struct sock *sk, u32 port_offset,
364 int (*check_established)(struct inet_timewait_death_row *, 402 int (*check_established)(struct inet_timewait_death_row *,
365 struct sock *, __u16, struct inet_timewait_sock **), 403 struct sock *, __u16, struct inet_timewait_sock **),
366 void (*hash)(struct inet_hashinfo *, struct sock *)) 404 void (*hash)(struct sock *sk))
367{ 405{
368 struct inet_hashinfo *hinfo = death_row->hashinfo; 406 struct inet_hashinfo *hinfo = death_row->hashinfo;
369 const unsigned short snum = inet_sk(sk)->num; 407 const unsigned short snum = inet_sk(sk)->num;
@@ -375,7 +413,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
375 if (!snum) { 413 if (!snum) {
376 int i, remaining, low, high, port; 414 int i, remaining, low, high, port;
377 static u32 hint; 415 static u32 hint;
378 u32 offset = hint + inet_sk_port_offset(sk); 416 u32 offset = hint + port_offset;
379 struct hlist_node *node; 417 struct hlist_node *node;
380 struct inet_timewait_sock *tw = NULL; 418 struct inet_timewait_sock *tw = NULL;
381 419
@@ -427,7 +465,7 @@ ok:
427 inet_bind_hash(sk, tb, port); 465 inet_bind_hash(sk, tb, port);
428 if (sk_unhashed(sk)) { 466 if (sk_unhashed(sk)) {
429 inet_sk(sk)->sport = htons(port); 467 inet_sk(sk)->sport = htons(port);
430 hash(hinfo, sk); 468 hash(sk);
431 } 469 }
432 spin_unlock(&head->lock); 470 spin_unlock(&head->lock);
433 471
@@ -444,7 +482,7 @@ ok:
444 tb = inet_csk(sk)->icsk_bind_hash; 482 tb = inet_csk(sk)->icsk_bind_hash;
445 spin_lock_bh(&head->lock); 483 spin_lock_bh(&head->lock);
446 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 484 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
447 hash(hinfo, sk); 485 hash(sk);
448 spin_unlock_bh(&head->lock); 486 spin_unlock_bh(&head->lock);
449 return 0; 487 return 0;
450 } else { 488 } else {
@@ -464,7 +502,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_connect);
464int inet_hash_connect(struct inet_timewait_death_row *death_row, 502int inet_hash_connect(struct inet_timewait_death_row *death_row,
465 struct sock *sk) 503 struct sock *sk)
466{ 504{
467 return __inet_hash_connect(death_row, sk, 505 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
468 __inet_check_established, __inet_hash_nolisten); 506 __inet_check_established, __inet_hash_nolisten);
469} 507}
470 508
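__inet_hash_connect() gets the matching treatment: the two family-specific ingredients, the ephemeral-port offset and the hash function, become parameters, so the IPv6 connect path (see the inet6_hashtables.c hunk below) can reuse the IPv4 search loop instead of duplicating it. The two resulting call sites, quoted from this patch:

        /* IPv4, net/ipv4/inet_hashtables.c */
        return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
                                   __inet_check_established, __inet_hash_nolisten);

        /* IPv6, net/ipv6/inet6_hashtables.c */
        return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
                                   __inet6_check_established, __inet6_hash);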
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
index 749fa044eca5..85c680add6df 100644
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/net.h>
25 26
26#include <net/ip_vs.h> 27#include <net/ip_vs.h>
27 28
@@ -169,7 +170,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
169 */ 170 */
170 if (mark->cw == 0) { 171 if (mark->cw == 0) {
171 mark->cl = &svc->destinations; 172 mark->cl = &svc->destinations;
172 IP_VS_INFO("ip_vs_wrr_schedule(): " 173 IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
173 "no available servers\n"); 174 "no available servers\n");
174 dest = NULL; 175 dest = NULL;
175 goto out; 176 goto out;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a0d373bd9065..071e83a894ad 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1669,7 +1669,7 @@ void tcp_set_state(struct sock *sk, int state)
1669 sk->sk_prot->unhash(sk); 1669 sk->sk_prot->unhash(sk);
1670 if (inet_csk(sk)->icsk_bind_hash && 1670 if (inet_csk(sk)->icsk_bind_hash &&
1671 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) 1671 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1672 inet_put_port(&tcp_hashinfo, sk); 1672 inet_put_port(sk);
1673 /* fall through */ 1673 /* fall through */
1674 default: 1674 default:
1675 if (oldstate==TCP_ESTABLISHED) 1675 if (oldstate==TCP_ESTABLISHED)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77c1939a2b0d..63414ea427c5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -108,22 +108,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
108 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), 108 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109}; 109};
110 110
111static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
112{
113 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
114 inet_csk_bind_conflict);
115}
116
117static void tcp_v4_hash(struct sock *sk)
118{
119 inet_hash(&tcp_hashinfo, sk);
120}
121
122void tcp_unhash(struct sock *sk)
123{
124 inet_unhash(&tcp_hashinfo, sk);
125}
126
127static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 111static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
128{ 112{
129 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 113 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
@@ -1478,8 +1462,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1478 } 1462 }
1479#endif 1463#endif
1480 1464
1481 __inet_hash_nolisten(&tcp_hashinfo, newsk); 1465 __inet_hash_nolisten(newsk);
1482 __inet_inherit_port(&tcp_hashinfo, sk, newsk); 1466 __inet_inherit_port(sk, newsk);
1483 1467
1484 return newsk; 1468 return newsk;
1485 1469
@@ -1827,6 +1811,7 @@ struct inet_connection_sock_af_ops ipv4_specific = {
1827 .getsockopt = ip_getsockopt, 1811 .getsockopt = ip_getsockopt,
1828 .addr2sockaddr = inet_csk_addr2sockaddr, 1812 .addr2sockaddr = inet_csk_addr2sockaddr,
1829 .sockaddr_len = sizeof(struct sockaddr_in), 1813 .sockaddr_len = sizeof(struct sockaddr_in),
1814 .bind_conflict = inet_csk_bind_conflict,
1830#ifdef CONFIG_COMPAT 1815#ifdef CONFIG_COMPAT
1831 .compat_setsockopt = compat_ip_setsockopt, 1816 .compat_setsockopt = compat_ip_setsockopt,
1832 .compat_getsockopt = compat_ip_getsockopt, 1817 .compat_getsockopt = compat_ip_getsockopt,
@@ -1926,7 +1911,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
1926 1911
1927 /* Clean up a referenced TCP bind bucket. */ 1912 /* Clean up a referenced TCP bind bucket. */
1928 if (inet_csk(sk)->icsk_bind_hash) 1913 if (inet_csk(sk)->icsk_bind_hash)
1929 inet_put_port(&tcp_hashinfo, sk); 1914 inet_put_port(sk);
1930 1915
1931 /* 1916 /*
1932 * If sendmsg cached page exists, toss it. 1917 * If sendmsg cached page exists, toss it.
@@ -2435,9 +2420,9 @@ struct proto tcp_prot = {
2435 .getsockopt = tcp_getsockopt, 2420 .getsockopt = tcp_getsockopt,
2436 .recvmsg = tcp_recvmsg, 2421 .recvmsg = tcp_recvmsg,
2437 .backlog_rcv = tcp_v4_do_rcv, 2422 .backlog_rcv = tcp_v4_do_rcv,
2438 .hash = tcp_v4_hash, 2423 .hash = inet_hash,
2439 .unhash = tcp_unhash, 2424 .unhash = inet_unhash,
2440 .get_port = tcp_v4_get_port, 2425 .get_port = inet_csk_get_port,
2441 .enter_memory_pressure = tcp_enter_memory_pressure, 2426 .enter_memory_pressure = tcp_enter_memory_pressure,
2442 .sockets_allocated = &tcp_sockets_allocated, 2427 .sockets_allocated = &tcp_sockets_allocated,
2443 .orphan_count = &tcp_orphan_count, 2428 .orphan_count = &tcp_orphan_count,
@@ -2450,6 +2435,7 @@ struct proto tcp_prot = {
2450 .obj_size = sizeof(struct tcp_sock), 2435 .obj_size = sizeof(struct tcp_sock),
2451 .twsk_prot = &tcp_timewait_sock_ops, 2436 .twsk_prot = &tcp_timewait_sock_ops,
2452 .rsk_prot = &tcp_request_sock_ops, 2437 .rsk_prot = &tcp_request_sock_ops,
2438 .hashinfo = &tcp_hashinfo,
2453#ifdef CONFIG_COMPAT 2439#ifdef CONFIG_COMPAT
2454 .compat_setsockopt = compat_tcp_setsockopt, 2440 .compat_setsockopt = compat_tcp_setsockopt,
2455 .compat_getsockopt = compat_tcp_getsockopt, 2441 .compat_getsockopt = compat_tcp_getsockopt,
@@ -2467,7 +2453,6 @@ void __init tcp_v4_init(struct net_proto_family *ops)
2467EXPORT_SYMBOL(ipv4_specific); 2453EXPORT_SYMBOL(ipv4_specific);
2468EXPORT_SYMBOL(tcp_hashinfo); 2454EXPORT_SYMBOL(tcp_hashinfo);
2469EXPORT_SYMBOL(tcp_prot); 2455EXPORT_SYMBOL(tcp_prot);
2470EXPORT_SYMBOL(tcp_unhash);
2471EXPORT_SYMBOL(tcp_v4_conn_request); 2456EXPORT_SYMBOL(tcp_v4_conn_request);
2472EXPORT_SYMBOL(tcp_v4_connect); 2457EXPORT_SYMBOL(tcp_v4_connect);
2473EXPORT_SYMBOL(tcp_v4_do_rcv); 2458EXPORT_SYMBOL(tcp_v4_do_rcv);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e093a7b59e18..b47030ba162b 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -102,7 +102,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
102 102
103 XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; 103 XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
104 104
105 if (!pskb_may_pull(skb, phlen)); 105 if (!pskb_may_pull(skb, phlen))
106 goto out; 106 goto out;
107 __skb_pull(skb, phlen); 107 __skb_pull(skb, phlen);
108 } 108 }
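The xfrm4_mode_beet.c hunk is a one-character fix for a stray semicolon: "if (cond);" is an if with an empty body, so the "goto out;" on the next line ran unconditionally and the __skb_pull() after it was unreachable. A self-contained demonstration of the trap (names invented):

#include <stdio.h>

static int may_pull(void) { return 1; }  /* stands in for pskb_may_pull() */

int main(void)
{
        if (!may_pull());                /* BUG: ';' closes the if        */
                puts("runs unconditionally, despite the indentation");

        if (!may_pull())                 /* fixed: the body is guarded    */
                puts("correctly skipped when may_pull() succeeds");
        return 0;
}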
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index cbb5b9cf84ad..121d517bf91c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -683,7 +683,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
683 } 683 }
684 } 684 }
685 685
686 __skb_pull(skb, sizeof(*hdr)); 686 if (!pskb_pull(skb, sizeof(*hdr)))
687 goto discard_it;
687 688
688 hdr = icmp6_hdr(skb); 689 hdr = icmp6_hdr(skb);
689 690
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d325a9958909..99fd25f7f005 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -22,9 +22,9 @@
22#include <net/inet6_hashtables.h> 22#include <net/inet6_hashtables.h>
23#include <net/ip.h> 23#include <net/ip.h>
24 24
25void __inet6_hash(struct inet_hashinfo *hashinfo, 25void __inet6_hash(struct sock *sk)
26 struct sock *sk)
27{ 26{
27 struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
28 struct hlist_head *list; 28 struct hlist_head *list;
29 rwlock_t *lock; 29 rwlock_t *lock;
30 30
@@ -236,7 +236,7 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
236int inet6_hash_connect(struct inet_timewait_death_row *death_row, 236int inet6_hash_connect(struct inet_timewait_death_row *death_row,
237 struct sock *sk) 237 struct sock *sk)
238{ 238{
239 return __inet_hash_connect(death_row, sk, 239 return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
240 __inet6_check_established, __inet6_hash); 240 __inet6_check_established, __inet6_hash);
241} 241}
242 242
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 59d0029e93a7..12750f2b05ab 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -86,12 +86,6 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific;
86static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; 86static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
87#endif 87#endif
88 88
89static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
90{
91 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
92 inet6_csk_bind_conflict);
93}
94
95static void tcp_v6_hash(struct sock *sk) 89static void tcp_v6_hash(struct sock *sk)
96{ 90{
97 if (sk->sk_state != TCP_CLOSE) { 91 if (sk->sk_state != TCP_CLOSE) {
@@ -100,7 +94,7 @@ static void tcp_v6_hash(struct sock *sk)
100 return; 94 return;
101 } 95 }
102 local_bh_disable(); 96 local_bh_disable();
103 __inet6_hash(&tcp_hashinfo, sk); 97 __inet6_hash(sk);
104 local_bh_enable(); 98 local_bh_enable();
105 } 99 }
106} 100}
@@ -1504,8 +1498,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1504 } 1498 }
1505#endif 1499#endif
1506 1500
1507 __inet6_hash(&tcp_hashinfo, newsk); 1501 __inet6_hash(newsk);
1508 inet_inherit_port(&tcp_hashinfo, sk, newsk); 1502 inet_inherit_port(sk, newsk);
1509 1503
1510 return newsk; 1504 return newsk;
1511 1505
@@ -1833,6 +1827,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = {
1833 .getsockopt = ipv6_getsockopt, 1827 .getsockopt = ipv6_getsockopt,
1834 .addr2sockaddr = inet6_csk_addr2sockaddr, 1828 .addr2sockaddr = inet6_csk_addr2sockaddr,
1835 .sockaddr_len = sizeof(struct sockaddr_in6), 1829 .sockaddr_len = sizeof(struct sockaddr_in6),
1830 .bind_conflict = inet6_csk_bind_conflict,
1836#ifdef CONFIG_COMPAT 1831#ifdef CONFIG_COMPAT
1837 .compat_setsockopt = compat_ipv6_setsockopt, 1832 .compat_setsockopt = compat_ipv6_setsockopt,
1838 .compat_getsockopt = compat_ipv6_getsockopt, 1833 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -1864,6 +1859,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
1864 .getsockopt = ipv6_getsockopt, 1859 .getsockopt = ipv6_getsockopt,
1865 .addr2sockaddr = inet6_csk_addr2sockaddr, 1860 .addr2sockaddr = inet6_csk_addr2sockaddr,
1866 .sockaddr_len = sizeof(struct sockaddr_in6), 1861 .sockaddr_len = sizeof(struct sockaddr_in6),
1862 .bind_conflict = inet6_csk_bind_conflict,
1867#ifdef CONFIG_COMPAT 1863#ifdef CONFIG_COMPAT
1868 .compat_setsockopt = compat_ipv6_setsockopt, 1864 .compat_setsockopt = compat_ipv6_setsockopt,
1869 .compat_getsockopt = compat_ipv6_getsockopt, 1865 .compat_getsockopt = compat_ipv6_getsockopt,
@@ -2127,8 +2123,8 @@ struct proto tcpv6_prot = {
2127 .recvmsg = tcp_recvmsg, 2123 .recvmsg = tcp_recvmsg,
2128 .backlog_rcv = tcp_v6_do_rcv, 2124 .backlog_rcv = tcp_v6_do_rcv,
2129 .hash = tcp_v6_hash, 2125 .hash = tcp_v6_hash,
2130 .unhash = tcp_unhash, 2126 .unhash = inet_unhash,
2131 .get_port = tcp_v6_get_port, 2127 .get_port = inet_csk_get_port,
2132 .enter_memory_pressure = tcp_enter_memory_pressure, 2128 .enter_memory_pressure = tcp_enter_memory_pressure,
2133 .sockets_allocated = &tcp_sockets_allocated, 2129 .sockets_allocated = &tcp_sockets_allocated,
2134 .memory_allocated = &tcp_memory_allocated, 2130 .memory_allocated = &tcp_memory_allocated,
@@ -2141,6 +2137,7 @@ struct proto tcpv6_prot = {
2141 .obj_size = sizeof(struct tcp6_sock), 2137 .obj_size = sizeof(struct tcp6_sock),
2142 .twsk_prot = &tcp6_timewait_sock_ops, 2138 .twsk_prot = &tcp6_timewait_sock_ops,
2143 .rsk_prot = &tcp6_request_sock_ops, 2139 .rsk_prot = &tcp6_request_sock_ops,
2140 .hashinfo = &tcp_hashinfo,
2144#ifdef CONFIG_COMPAT 2141#ifdef CONFIG_COMPAT
2145 .compat_setsockopt = compat_tcp_setsockopt, 2142 .compat_setsockopt = compat_tcp_setsockopt,
2146 .compat_getsockopt = compat_tcp_getsockopt, 2143 .compat_getsockopt = compat_tcp_getsockopt,
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 09c255002e56..45c7c0c3875e 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -1,6 +1,5 @@
1config MAC80211 1config MAC80211
2 tristate "Generic IEEE 802.11 Networking Stack (mac80211)" 2 tristate "Generic IEEE 802.11 Networking Stack (mac80211)"
3 depends on EXPERIMENTAL
4 select CRYPTO 3 select CRYPTO
5 select CRYPTO_ECB 4 select CRYPTO_ECB
6 select CRYPTO_ARC4 5 select CRYPTO_ARC4
@@ -98,6 +97,18 @@ config MAC80211_DEBUGFS
98 97
99 Say N unless you know you need this. 98 Say N unless you know you need this.
100 99
100config MAC80211_DEBUG_PACKET_ALIGNMENT
101 bool "Enable packet alignment debugging"
102 depends on MAC80211
103 help
104 This option is recommended for driver authors and strongly
 105 discouraged for everybody else; it will trigger a warning
106 when a driver hands mac80211 a buffer that is aligned in
107 a way that will cause problems with the IP stack on some
108 architectures.
109
110 Say N unless you're writing a mac80211 based driver.
111
101config MAC80211_DEBUG 112config MAC80211_DEBUG
102 bool "Enable debugging output" 113 bool "Enable debugging output"
103 depends on MAC80211 114 depends on MAC80211
diff --git a/net/mac80211/ieee80211.c b/net/mac80211/ieee80211.c
index 5dcc2d61551f..67b7c75c430d 100644
--- a/net/mac80211/ieee80211.c
+++ b/net/mac80211/ieee80211.c
@@ -1344,17 +1344,17 @@ static int __init ieee80211_init(void)
1344 1344
1345 ret = rc80211_simple_init(); 1345 ret = rc80211_simple_init();
1346 if (ret) 1346 if (ret)
1347 goto fail; 1347 goto out;
1348 1348
1349 ret = rc80211_pid_init(); 1349 ret = rc80211_pid_init();
1350 if (ret) 1350 if (ret)
1351 goto fail_simple; 1351 goto out_cleanup_simple;
1352 1352
1353 ret = ieee80211_wme_register(); 1353 ret = ieee80211_wme_register();
1354 if (ret) { 1354 if (ret) {
1355 printk(KERN_DEBUG "ieee80211_init: failed to " 1355 printk(KERN_DEBUG "ieee80211_init: failed to "
1356 "initialize WME (err=%d)\n", ret); 1356 "initialize WME (err=%d)\n", ret);
1357 goto fail_pid; 1357 goto out_cleanup_pid;
1358 } 1358 }
1359 1359
1360 ieee80211_debugfs_netdev_init(); 1360 ieee80211_debugfs_netdev_init();
@@ -1362,11 +1362,11 @@ static int __init ieee80211_init(void)
1362 1362
1363 return 0; 1363 return 0;
1364 1364
1365 fail_pid: 1365 out_cleanup_pid:
1366 rc80211_simple_exit();
1367 fail_simple:
1368 rc80211_pid_exit(); 1366 rc80211_pid_exit();
1369 fail: 1367 out_cleanup_simple:
1368 rc80211_simple_exit();
1369 out:
1370 return ret; 1370 return ret;
1371} 1371}
1372 1372
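The relabeled error path in ieee80211_init() restores the usual kernel unwind convention: clean up in exact reverse order of initialization, with each label named after the next thing to undo. The old labels were crossed, so a failing rc80211_pid_init() jumped to a label that called rc80211_pid_exit() (never initialized) and leaked the simple rate controller. A compact, self-contained model of the corrected ladder, mirroring the three steps above with invented stand-ins:

#include <stdio.h>

static int  init_simple(void) { puts("simple: init"); return 0; }
static void exit_simple(void) { puts("simple: exit"); }
static int  init_pid(void)    { puts("pid: init");    return 0; }
static void exit_pid(void)    { puts("pid: exit");    }
static int  init_wme(void)    { puts("wme: failed");  return -1; }

static int demo_init(void)
{
        int ret;

        ret = init_simple();
        if (ret)
                goto out;
        ret = init_pid();
        if (ret)
                goto out_cleanup_simple;
        ret = init_wme();
        if (ret)
                goto out_cleanup_pid;
        return 0;

out_cleanup_pid:                 /* undo in reverse order of setup */
        exit_pid();
out_cleanup_simple:
        exit_simple();
out:
        return ret;
}

int main(void) { return demo_init() ? 1 : 0; }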
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index 554c4baed6fb..c339571632b2 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -538,7 +538,7 @@ int __init rc80211_pid_init(void)
538 return ieee80211_rate_control_register(&mac80211_rcpid); 538 return ieee80211_rate_control_register(&mac80211_rcpid);
539} 539}
540 540
541void __exit rc80211_pid_exit(void) 541void rc80211_pid_exit(void)
542{ 542{
543 ieee80211_rate_control_unregister(&mac80211_rcpid); 543 ieee80211_rate_control_unregister(&mac80211_rcpid);
544} 544}
diff --git a/net/mac80211/rc80211_simple.c b/net/mac80211/rc80211_simple.c
index 934676d687d6..9a78b116acff 100644
--- a/net/mac80211/rc80211_simple.c
+++ b/net/mac80211/rc80211_simple.c
@@ -389,7 +389,7 @@ int __init rc80211_simple_init(void)
389 return ieee80211_rate_control_register(&mac80211_rcsimple); 389 return ieee80211_rate_control_register(&mac80211_rcsimple);
390} 390}
391 391
392void __exit rc80211_simple_exit(void) 392void rc80211_simple_exit(void)
393{ 393{
394 ieee80211_rate_control_unregister(&mac80211_rcsimple); 394 ieee80211_rate_control_unregister(&mac80211_rcsimple);
395} 395}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index d44c87269bcb..535407d07fa4 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -340,11 +340,15 @@ static u32 ieee80211_rx_load_stats(struct ieee80211_local *local,
340 return load; 340 return load;
341} 341}
342 342
343#ifdef CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT
343static ieee80211_txrx_result 344static ieee80211_txrx_result
344ieee80211_rx_h_verify_ip_alignment(struct ieee80211_txrx_data *rx) 345ieee80211_rx_h_verify_ip_alignment(struct ieee80211_txrx_data *rx)
345{ 346{
346 int hdrlen; 347 int hdrlen;
347 348
349 if (!WLAN_FC_DATA_PRESENT(rx->fc))
350 return TXRX_CONTINUE;
351
348 /* 352 /*
349 * Drivers are required to align the payload data in a way that 353 * Drivers are required to align the payload data in a way that
350 * guarantees that the contained IP header is aligned to a four- 354 * guarantees that the contained IP header is aligned to a four-
@@ -371,11 +375,14 @@ ieee80211_rx_h_verify_ip_alignment(struct ieee80211_txrx_data *rx)
371 375
372 return TXRX_CONTINUE; 376 return TXRX_CONTINUE;
373} 377}
378#endif
374 379
375ieee80211_rx_handler ieee80211_rx_pre_handlers[] = 380ieee80211_rx_handler ieee80211_rx_pre_handlers[] =
376{ 381{
377 ieee80211_rx_h_parse_qos, 382 ieee80211_rx_h_parse_qos,
383#ifdef CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT
378 ieee80211_rx_h_verify_ip_alignment, 384 ieee80211_rx_h_verify_ip_alignment,
385#endif
379 NULL 386 NULL
380}; 387};
381 388
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index becf91a952ae..c7ad64d664ad 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -90,7 +90,7 @@ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1
90 * safely. 90 * safely.
91 * 91 *
92 */ 92 */
93static void netlbl_cipsov4_doi_free(struct rcu_head *entry) 93void netlbl_cipsov4_doi_free(struct rcu_head *entry)
94{ 94{
95 struct cipso_v4_doi *ptr; 95 struct cipso_v4_doi *ptr;
96 96
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
index f03cf9b78286..220cb9d06b49 100644
--- a/net/netlabel/netlabel_cipso_v4.h
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -163,4 +163,7 @@ enum {
163/* NetLabel protocol functions */ 163/* NetLabel protocol functions */
164int netlbl_cipsov4_genl_init(void); 164int netlbl_cipsov4_genl_init(void);
165 165
166/* Free the memory associated with a CIPSOv4 DOI definition */
167void netlbl_cipsov4_doi_free(struct rcu_head *entry);
168
166#endif 169#endif
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 3689956c3436..8220990ceb96 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -61,6 +61,7 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
61 struct netlbl_audit *audit_info); 61 struct netlbl_audit *audit_info);
62int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, 62int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
63 struct netlbl_audit *audit_info); 63 struct netlbl_audit *audit_info);
64int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
64int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); 65int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
65struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); 66struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
66int netlbl_domhsh_walk(u32 *skip_bkt, 67int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index c69e3e1f05c3..39793a1a93aa 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -30,6 +30,7 @@
30 30
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/types.h> 32#include <linux/types.h>
33#include <linux/audit.h>
33#include <net/ip.h> 34#include <net/ip.h>
34#include <net/netlabel.h> 35#include <net/netlabel.h>
35#include <net/cipso_ipv4.h> 36#include <net/cipso_ipv4.h>
@@ -38,10 +39,186 @@
38 39
39#include "netlabel_domainhash.h" 40#include "netlabel_domainhash.h"
40#include "netlabel_unlabeled.h" 41#include "netlabel_unlabeled.h"
42#include "netlabel_cipso_v4.h"
41#include "netlabel_user.h" 43#include "netlabel_user.h"
42#include "netlabel_mgmt.h" 44#include "netlabel_mgmt.h"
43 45
44/* 46/*
47 * Configuration Functions
48 */
49
50/**
51 * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
52 * @domain: the domain mapping to remove
53 * @audit_info: NetLabel audit information
54 *
55 * Description:
56 * Removes a NetLabel/LSM domain mapping. A @domain value of NULL causes the
57 * default domain mapping to be removed. Returns zero on success, negative
58 * values on failure.
59 *
60 */
61int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info)
62{
63 return netlbl_domhsh_remove(domain, audit_info);
64}
65
66/**
67 * netlbl_cfg_unlbl_add_map - Add an unlabeled NetLabel/LSM domain mapping
68 * @domain: the domain mapping to add
69 * @audit_info: NetLabel audit information
70 *
71 * Description:
72 * Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL
73 * causes a new default domain mapping to be added. Returns zero on success,
74 * negative values on failure.
75 *
76 */
77int netlbl_cfg_unlbl_add_map(const char *domain,
78 struct netlbl_audit *audit_info)
79{
80 int ret_val = -ENOMEM;
81 struct netlbl_dom_map *entry;
82
83 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
84 if (entry == NULL)
85 goto cfg_unlbl_add_map_failure;
86 if (domain != NULL) {
87 entry->domain = kstrdup(domain, GFP_ATOMIC);
88 if (entry->domain == NULL)
89 goto cfg_unlbl_add_map_failure;
90 }
91 entry->type = NETLBL_NLTYPE_UNLABELED;
92
93 ret_val = netlbl_domhsh_add(entry, audit_info);
94 if (ret_val != 0)
95 goto cfg_unlbl_add_map_failure;
96
97 return 0;
98
99cfg_unlbl_add_map_failure:
100 if (entry != NULL)
101 kfree(entry->domain);
102 kfree(entry);
103 return ret_val;
104}
105
106/**
107 * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
108 * @doi_def: the DOI definition
109 * @audit_info: NetLabel audit information
110 *
111 * Description:
112 * Add a new CIPSOv4 DOI definition to the NetLabel subsystem. Returns zero on
113 * success, negative values on failure.
114 *
115 */
116int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
117 struct netlbl_audit *audit_info)
118{
119 int ret_val;
120 const char *type_str;
121 struct audit_buffer *audit_buf;
122
123 ret_val = cipso_v4_doi_add(doi_def);
124
125 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
126 audit_info);
127 if (audit_buf != NULL) {
128 switch (doi_def->type) {
129 case CIPSO_V4_MAP_STD:
130 type_str = "std";
131 break;
132 case CIPSO_V4_MAP_PASS:
133 type_str = "pass";
134 break;
135 default:
136 type_str = "(unknown)";
137 }
138 audit_log_format(audit_buf,
139 " cipso_doi=%u cipso_type=%s res=%u",
140 doi_def->doi,
141 type_str,
142 ret_val == 0 ? 1 : 0);
143 audit_log_end(audit_buf);
144 }
145
146 return ret_val;
147}
148
149/**
150 * netlbl_cfg_cipsov4_add_map - Add a new CIPSOv4 DOI definition and mapping
151 * @doi_def: the DOI definition
152 * @domain: the domain mapping to add
153 * @audit_info: NetLabel audit information
154 *
155 * Description:
156 * Add a new CIPSOv4 DOI definition and NetLabel/LSM domain mapping for this
157 * new DOI definition to the NetLabel subsystem. A @domain value of NULL adds
158 * a new default domain mapping. Returns zero on success, negative values on
159 * failure.
160 *
161 */
162int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
163 const char *domain,
164 struct netlbl_audit *audit_info)
165{
166 int ret_val = -ENOMEM;
167 struct netlbl_dom_map *entry;
168
169 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
170 if (entry == NULL)
171 goto cfg_cipsov4_add_map_failure;
172 if (domain != NULL) {
173 entry->domain = kstrdup(domain, GFP_ATOMIC);
174 if (entry->domain == NULL)
175 goto cfg_cipsov4_add_map_failure;
176 }
177 entry->type = NETLBL_NLTYPE_CIPSOV4;
178 entry->type_def.cipsov4 = doi_def;
179
 180 /* Grab an RCU read lock here so nothing happens to the doi_def variable
181 * between adding it to the CIPSOv4 protocol engine and adding a
182 * domain mapping for it. */
183
184 rcu_read_lock();
185 ret_val = netlbl_cfg_cipsov4_add(doi_def, audit_info);
186 if (ret_val != 0)
187 goto cfg_cipsov4_add_map_failure_unlock;
188 ret_val = netlbl_domhsh_add(entry, audit_info);
189 if (ret_val != 0)
190 goto cfg_cipsov4_add_map_failure_remove_doi;
191 rcu_read_unlock();
192
193 return 0;
194
195cfg_cipsov4_add_map_failure_remove_doi:
196 cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free);
197cfg_cipsov4_add_map_failure_unlock:
198 rcu_read_unlock();
199cfg_cipsov4_add_map_failure:
200 if (entry != NULL)
201 kfree(entry->domain);
202 kfree(entry);
203 return ret_val;
204}
205
206/**
 207 * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition
208 * @doi: the CIPSO DOI value
209 * @audit_info: NetLabel audit information
210 *
211 * Description:
212 * Removes an existing CIPSOv4 DOI definition from the NetLabel subsystem.
213 * Returns zero on success, negative values on failure.
214 *
215 */
216int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
217{
218 return cipso_v4_doi_remove(doi, audit_info, netlbl_cipsov4_doi_free);
219}
220
221/*
45 * Security Attribute Functions 222 * Security Attribute Functions
46 */ 223 */
47 224
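The block above turns what used to be netlink-only operations into an in-kernel configuration API (the netlbl_cfg_* family), which is what lets cipso_ipv4.c call netlbl_cfg_map_del() in the hunk further up instead of reaching into the domain hash directly. A hedged usage sketch for an in-kernel caller; the domain string is invented, and a real caller must populate struct netlbl_audit from its own audit context:

        struct netlbl_audit audit_info; /* filled in by the caller */
        int err;

        /* pass traffic for one domain as unlabeled... */
        err = netlbl_cfg_unlbl_add_map("example_domain", &audit_info);
        if (err)
                return err;

        /* ...and tear the mapping down again later */
        err = netlbl_cfg_map_del("example_domain", &audit_info);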
diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 6562f868e82f..1a47f5d1be17 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -340,7 +340,7 @@ EXPORT_SYMBOL(rfkill_allocate);
340 * rfkill_free - Mark rfkill structure for deletion 340 * rfkill_free - Mark rfkill structure for deletion
341 * @rfkill: rfkill structure to be destroyed 341 * @rfkill: rfkill structure to be destroyed
342 * 342 *
343 * Decrements reference count of rfkill structure so it is destoryed. 343 * Decrements reference count of rfkill structure so it is destroyed.
344 * Note that rfkill_free() should _not_ be called after rfkill_unregister(). 344 * Note that rfkill_free() should _not_ be called after rfkill_unregister().
345 */ 345 */
346void rfkill_free(struct rfkill *rfkill) 346void rfkill_free(struct rfkill *rfkill)
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5a7f6a3060fc..971b867e0484 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -19,6 +19,7 @@
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/ip.h> 20#include <linux/ip.h>
21#include <linux/ipv6.h> 21#include <linux/ipv6.h>
22#include <linux/if_vlan.h>
22 23
23#include <net/pkt_cls.h> 24#include <net/pkt_cls.h>
24#include <net/ip.h> 25#include <net/ip.h>
@@ -270,6 +271,15 @@ static u32 flow_get_skgid(const struct sk_buff *skb)
270 return 0; 271 return 0;
271} 272}
272 273
274static u32 flow_get_vlan_tag(const struct sk_buff *skb)
275{
276 u16 uninitialized_var(tag);
277
278 if (vlan_get_tag(skb, &tag) < 0)
279 return 0;
280 return tag & VLAN_VID_MASK;
281}
282
273static u32 flow_key_get(const struct sk_buff *skb, int key) 283static u32 flow_key_get(const struct sk_buff *skb, int key)
274{ 284{
275 switch (key) { 285 switch (key) {
@@ -305,6 +315,8 @@ static u32 flow_key_get(const struct sk_buff *skb, int key)
305 return flow_get_skuid(skb); 315 return flow_get_skuid(skb);
306 case FLOW_KEY_SKGID: 316 case FLOW_KEY_SKGID:
307 return flow_get_skgid(skb); 317 return flow_get_skgid(skb);
318 case FLOW_KEY_VLAN_TAG:
319 return flow_get_vlan_tag(skb);
308 default: 320 default:
309 WARN_ON(1); 321 WARN_ON(1);
310 return 0; 322 return 0;
@@ -402,12 +414,13 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,
402 414
403 if (tb[TCA_FLOW_KEYS]) { 415 if (tb[TCA_FLOW_KEYS]) {
404 keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); 416 keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
405 if (fls(keymask) - 1 > FLOW_KEY_MAX)
406 return -EOPNOTSUPP;
407 417
408 nkeys = hweight32(keymask); 418 nkeys = hweight32(keymask);
409 if (nkeys == 0) 419 if (nkeys == 0)
410 return -EINVAL; 420 return -EINVAL;
421
422 if (fls(keymask) - 1 > FLOW_KEY_MAX)
423 return -EOPNOTSUPP;
411 } 424 }
412 425
413 err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); 426 err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
@@ -594,11 +607,11 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
594 607
595 if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) 608 if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
596 goto nla_put_failure; 609 goto nla_put_failure;
597 610#ifdef CONFIG_NET_EMATCH
598 if (f->ematches.hdr.nmatches && 611 if (f->ematches.hdr.nmatches &&
599 tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) 612 tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
600 goto nla_put_failure; 613 goto nla_put_failure;
601 614#endif
602 nla_nest_end(skb, nest); 615 nla_nest_end(skb, nest);
603 616
604 if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) 617 if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
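flow_get_vlan_tag() above masks the 16-bit VLAN TCI down to the 12-bit VLAN ID with VLAN_VID_MASK, discarding the priority (PCP) and DEI bits. A self-contained illustration of that masking; 0x0fff matches the VLAN_VID_MASK definition in linux/if_vlan.h:

#include <stdio.h>
#include <stdint.h>

#define VLAN_VID_MASK 0x0fff

int main(void)
{
        uint16_t tci = 0xa064;  /* PCP=5, DEI=0, VID=0x064 */

        printf("vid = 0x%03x\n", (unsigned)(tci & VLAN_VID_MASK)); /* 0x064 */
        return 0;
}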
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a1e5619b1876..2a7e648fbcf4 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -65,6 +65,7 @@
65#include <linux/string.h> 65#include <linux/string.h>
66#include <linux/skbuff.h> 66#include <linux/skbuff.h>
67#include <linux/random.h> 67#include <linux/random.h>
68#include <linux/if_vlan.h>
68#include <linux/tc_ematch/tc_em_meta.h> 69#include <linux/tc_ematch/tc_em_meta.h>
69#include <net/dst.h> 70#include <net/dst.h>
70#include <net/route.h> 71#include <net/route.h>
@@ -170,6 +171,21 @@ META_COLLECTOR(var_dev)
170} 171}
171 172
172/************************************************************************** 173/**************************************************************************
174 * vlan tag
175 **************************************************************************/
176
177META_COLLECTOR(int_vlan_tag)
178{
179 unsigned short uninitialized_var(tag);
180 if (vlan_get_tag(skb, &tag) < 0)
181 *err = -1;
182 else
183 dst->value = tag;
184}
185
186
187
188/**************************************************************************
173 * skb attributes 189 * skb attributes
174 **************************************************************************/ 190 **************************************************************************/
175 191
@@ -520,6 +536,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
520 [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), 536 [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
521 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), 537 [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
522 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), 538 [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
539 [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
523 } 540 }
524}; 541};
525 542
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 97e6ebd14500..ae367c82e512 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -420,15 +420,15 @@ struct sctp_shared_key *sctp_auth_get_shkey(
420 const struct sctp_association *asoc, 420 const struct sctp_association *asoc,
421 __u16 key_id) 421 __u16 key_id)
422{ 422{
423 struct sctp_shared_key *key = NULL; 423 struct sctp_shared_key *key;
424 424
425 /* First search associations set of endpoint pair shared keys */ 425 /* First search associations set of endpoint pair shared keys */
426 key_for_each(key, &asoc->endpoint_shared_keys) { 426 key_for_each(key, &asoc->endpoint_shared_keys) {
427 if (key->key_id == key_id) 427 if (key->key_id == key_id)
428 break; 428 return key;
429 } 429 }
430 430
431 return key; 431 return NULL;
432} 432}
433 433
434/* 434/*
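The sctp_auth_get_shkey() rewrite fixes a classic list-macro pitfall: key_for_each() is a list_for_each_entry()-style iterator, so when the loop runs to completion the cursor is not NULL but the bogus container_of() computed for the list head, and the old "break, then return key" shape could hand back a wild pointer whenever no key_id matched. Returning from inside the loop, with NULL after it, is the idiomatic shape the hunk adopts:

        key_for_each(key, &asoc->endpoint_shared_keys) {
                if (key->key_id == key_id)
                        return key;  /* cursor is only valid in here */
        }
        return NULL;                 /* exhausted: cursor is not an entry */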
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 61cbd5a8dd0c..f98658782d4f 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -537,7 +537,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
537 * 537 *
538 * This means that if we only want to abort associations 538 * This means that if we only want to abort associations
539 * in an authenticated way (i.e AUTH+ABORT), then we 539 * in an authenticated way (i.e AUTH+ABORT), then we
 540 * can't destory this association just becuase the packet 540 * can't destroy this association just because the packet
541 * was malformed. 541 * was malformed.
542 */ 542 */
543 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) 543 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
@@ -3865,6 +3865,10 @@ sctp_disposition_t sctp_sf_eat_auth(const struct sctp_endpoint *ep,
3865 struct sctp_chunk *err_chunk; 3865 struct sctp_chunk *err_chunk;
3866 sctp_ierror_t error; 3866 sctp_ierror_t error;
3867 3867
 3868 /* Make sure that the peer is AUTH capable */
3869 if (!asoc->peer.auth_capable)
3870 return sctp_sf_unk_chunk(ep, asoc, type, arg, commands);
3871
3868 if (!sctp_vtag_verify(chunk, asoc)) { 3872 if (!sctp_vtag_verify(chunk, asoc)) {
3869 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, 3873 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
3870 SCTP_NULL()); 3874 SCTP_NULL());
@@ -4130,7 +4134,7 @@ static sctp_disposition_t sctp_sf_abort_violation(
4130 * 4134 *
4131 * This means that if we only want to abort associations 4135 * This means that if we only want to abort associations
4132 * in an authenticated way (i.e AUTH+ABORT), then we 4136 * in an authenticated way (i.e AUTH+ABORT), then we
 4133 * can't destory this association just becuase the packet 4137 * can't destroy this association just because the packet
4134 * was malformed. 4138 * was malformed.
4135 */ 4139 */
4136 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) 4140 if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 5c69a725e530..92e1dbe50947 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
11 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
12 svc.o svcsock.o svcauth.o svcauth_unix.o \ 12 svc.o svcsock.o svcauth.o svcauth_unix.o \
13 rpcb_clnt.o timer.o xdr.o \ 13 rpcb_clnt.o timer.o xdr.o \
14 sunrpc_syms.o cache.o rpc_pipe.o 14 sunrpc_syms.o cache.o rpc_pipe.o \
15 svc_xprt.o
15sunrpc-$(CONFIG_PROC_FS) += stats.o 16sunrpc-$(CONFIG_PROC_FS) += stats.o
16sunrpc-$(CONFIG_SYSCTL) += sysctl.o 17sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73940df6c460..481f984e9a22 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd,
224 224
225 /* major/minor */ 225 /* major/minor */
226 len = qword_get(&mesg, buf, mlen); 226 len = qword_get(&mesg, buf, mlen);
227 if (len < 0) 227 if (len <= 0)
228 goto out; 228 goto out;
229 if (len == 0) { 229 rsii.major_status = simple_strtoul(buf, &ep, 10);
230 if (*ep)
231 goto out;
232 len = qword_get(&mesg, buf, mlen);
233 if (len <= 0)
234 goto out;
235 rsii.minor_status = simple_strtoul(buf, &ep, 10);
236 if (*ep)
230 goto out; 237 goto out;
231 } else {
232 rsii.major_status = simple_strtoul(buf, &ep, 10);
233 if (*ep)
234 goto out;
235 len = qword_get(&mesg, buf, mlen);
236 if (len <= 0)
237 goto out;
238 rsii.minor_status = simple_strtoul(buf, &ep, 10);
239 if (*ep)
240 goto out;
241 238
242 /* out_handle */ 239 /* out_handle */
243 len = qword_get(&mesg, buf, mlen); 240 len = qword_get(&mesg, buf, mlen);
244 if (len < 0) 241 if (len < 0)
245 goto out; 242 goto out;
246 status = -ENOMEM; 243 status = -ENOMEM;
247 if (dup_to_netobj(&rsii.out_handle, buf, len)) 244 if (dup_to_netobj(&rsii.out_handle, buf, len))
248 goto out; 245 goto out;
249 246
250 /* out_token */ 247 /* out_token */
251 len = qword_get(&mesg, buf, mlen); 248 len = qword_get(&mesg, buf, mlen);
252 status = -EINVAL; 249 status = -EINVAL;
253 if (len < 0) 250 if (len < 0)
254 goto out; 251 goto out;
255 status = -ENOMEM; 252 status = -ENOMEM;
256 if (dup_to_netobj(&rsii.out_token, buf, len)) 253 if (dup_to_netobj(&rsii.out_token, buf, len))
257 goto out; 254 goto out;
258 }
259 rsii.h.expiry_time = expiry; 255 rsii.h.expiry_time = expiry;
260 rsip = rsi_update(&rsii, rsip); 256 rsip = rsi_update(&rsii, rsip);
261 status = 0; 257 status = 0;
@@ -975,6 +971,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
975 struct kvec *resv = &rqstp->rq_res.head[0]; 971 struct kvec *resv = &rqstp->rq_res.head[0];
976 struct xdr_netobj tmpobj; 972 struct xdr_netobj tmpobj;
977 struct rsi *rsip, rsikey; 973 struct rsi *rsip, rsikey;
974 int ret;
978 975
979 /* Read the verifier; should be NULL: */ 976 /* Read the verifier; should be NULL: */
980 *authp = rpc_autherr_badverf; 977 *authp = rpc_autherr_badverf;
@@ -1014,23 +1011,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
1014 /* No upcall result: */ 1011 /* No upcall result: */
1015 return SVC_DROP; 1012 return SVC_DROP;
1016 case 0: 1013 case 0:
1014 ret = SVC_DROP;
1017 /* Got an answer to the upcall; use it: */ 1015 /* Got an answer to the upcall; use it: */
1018 if (gss_write_init_verf(rqstp, rsip)) 1016 if (gss_write_init_verf(rqstp, rsip))
1019 return SVC_DROP; 1017 goto out;
1020 if (resv->iov_len + 4 > PAGE_SIZE) 1018 if (resv->iov_len + 4 > PAGE_SIZE)
1021 return SVC_DROP; 1019 goto out;
1022 svc_putnl(resv, RPC_SUCCESS); 1020 svc_putnl(resv, RPC_SUCCESS);
1023 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1021 if (svc_safe_putnetobj(resv, &rsip->out_handle))
1024 return SVC_DROP; 1022 goto out;
1025 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1023 if (resv->iov_len + 3 * 4 > PAGE_SIZE)
1026 return SVC_DROP; 1024 goto out;
1027 svc_putnl(resv, rsip->major_status); 1025 svc_putnl(resv, rsip->major_status);
1028 svc_putnl(resv, rsip->minor_status); 1026 svc_putnl(resv, rsip->minor_status);
1029 svc_putnl(resv, GSS_SEQ_WIN); 1027 svc_putnl(resv, GSS_SEQ_WIN);
1030 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1028 if (svc_safe_putnetobj(resv, &rsip->out_token))
1031 return SVC_DROP; 1029 goto out;
1032 } 1030 }
1033 return SVC_COMPLETE; 1031 ret = SVC_COMPLETE;
1032out:
1033 cache_put(&rsip->h, &rsi_cache);
1034 return ret;
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1125,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1125 case RPC_GSS_PROC_DESTROY: 1126 case RPC_GSS_PROC_DESTROY:
1126 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1127 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
1127 goto auth_err; 1128 goto auth_err;
1129 rsci->h.expiry_time = get_seconds();
1128 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1130 set_bit(CACHE_NEGATIVE, &rsci->h.flags);
1129 if (resv->iov_len + 4 > PAGE_SIZE) 1131 if (resv->iov_len + 4 > PAGE_SIZE)
1130 goto drop; 1132 goto drop;
@@ -1386,19 +1388,26 @@ int
1386gss_svc_init(void) 1388gss_svc_init(void)
1387{ 1389{
1388 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); 1390 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
1389 if (rv == 0) { 1391 if (rv)
1390 cache_register(&rsc_cache); 1392 return rv;
1391 cache_register(&rsi_cache); 1393 rv = cache_register(&rsc_cache);
1392 } 1394 if (rv)
1395 goto out1;
1396 rv = cache_register(&rsi_cache);
1397 if (rv)
1398 goto out2;
1399 return 0;
1400out2:
1401 cache_unregister(&rsc_cache);
1402out1:
1403 svc_auth_unregister(RPC_AUTH_GSS);
1393 return rv; 1404 return rv;
1394} 1405}
1395 1406
1396void 1407void
1397gss_svc_shutdown(void) 1408gss_svc_shutdown(void)
1398{ 1409{
1399 if (cache_unregister(&rsc_cache)) 1410 cache_unregister(&rsc_cache);
1400 printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); 1411 cache_unregister(&rsi_cache);
1401 if (cache_unregister(&rsi_cache))
1402 printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
1403 svc_auth_unregister(RPC_AUTH_GSS); 1412 svc_auth_unregister(RPC_AUTH_GSS);
1404} 1413}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 73f053d0cc7a..636c8e04e0be 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail,
245 cache_put(h, detail); 245 cache_put(h, detail);
246 return rv; 246 return rv;
247} 247}
248EXPORT_SYMBOL(cache_check);
248 249
249/* 250/*
250 * caches need to be periodically cleaned. 251 * caches need to be periodically cleaned.
@@ -290,44 +291,78 @@ static const struct file_operations cache_flush_operations;
290static void do_cache_clean(struct work_struct *work); 291static void do_cache_clean(struct work_struct *work);
291static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); 292static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
292 293
293void cache_register(struct cache_detail *cd) 294static void remove_cache_proc_entries(struct cache_detail *cd)
294{ 295{
295 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); 296 if (cd->proc_ent == NULL)
296 if (cd->proc_ent) { 297 return;
297 struct proc_dir_entry *p; 298 if (cd->flush_ent)
298 cd->proc_ent->owner = cd->owner; 299 remove_proc_entry("flush", cd->proc_ent);
299 cd->channel_ent = cd->content_ent = NULL; 300 if (cd->channel_ent)
301 remove_proc_entry("channel", cd->proc_ent);
302 if (cd->content_ent)
303 remove_proc_entry("content", cd->proc_ent);
304 cd->proc_ent = NULL;
305 remove_proc_entry(cd->name, proc_net_rpc);
306}
300 307
301 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, 308#ifdef CONFIG_PROC_FS
302 cd->proc_ent); 309static int create_cache_proc_entries(struct cache_detail *cd)
303 cd->flush_ent = p; 310{
304 if (p) { 311 struct proc_dir_entry *p;
305 p->proc_fops = &cache_flush_operations;
306 p->owner = cd->owner;
307 p->data = cd;
308 }
309 312
310 if (cd->cache_request || cd->cache_parse) { 313 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
311 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, 314 if (cd->proc_ent == NULL)
312 cd->proc_ent); 315 goto out_nomem;
313 cd->channel_ent = p; 316 cd->proc_ent->owner = cd->owner;
314 if (p) { 317 cd->channel_ent = cd->content_ent = NULL;
315 p->proc_fops = &cache_file_operations; 318
316 p->owner = cd->owner; 319 p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent);
317 p->data = cd; 320 cd->flush_ent = p;
318 } 321 if (p == NULL)
319 } 322 goto out_nomem;
320 if (cd->cache_show) { 323 p->proc_fops = &cache_flush_operations;
321 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, 324 p->owner = cd->owner;
322 cd->proc_ent); 325 p->data = cd;
323 cd->content_ent = p; 326
324 if (p) { 327 if (cd->cache_request || cd->cache_parse) {
325 p->proc_fops = &content_file_operations; 328 p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
326 p->owner = cd->owner; 329 cd->proc_ent);
327 p->data = cd; 330 cd->channel_ent = p;
328 } 331 if (p == NULL)
329 } 332 goto out_nomem;
333 p->proc_fops = &cache_file_operations;
334 p->owner = cd->owner;
335 p->data = cd;
330 } 336 }
337 if (cd->cache_show) {
338 p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
339 cd->proc_ent);
340 cd->content_ent = p;
341 if (p == NULL)
342 goto out_nomem;
343 p->proc_fops = &content_file_operations;
344 p->owner = cd->owner;
345 p->data = cd;
346 }
347 return 0;
348out_nomem:
349 remove_cache_proc_entries(cd);
350 return -ENOMEM;
351}
352#else /* CONFIG_PROC_FS */
353static int create_cache_proc_entries(struct cache_detail *cd)
354{
355 return 0;
356}
357#endif
358
359int cache_register(struct cache_detail *cd)
360{
361 int ret;
362
363 ret = create_cache_proc_entries(cd);
364 if (ret)
365 return ret;
331 rwlock_init(&cd->hash_lock); 366 rwlock_init(&cd->hash_lock);
332 INIT_LIST_HEAD(&cd->queue); 367 INIT_LIST_HEAD(&cd->queue);
333 spin_lock(&cache_list_lock); 368 spin_lock(&cache_list_lock);
@@ -341,9 +376,11 @@ void cache_register(struct cache_detail *cd)
341 376
342 /* start the cleaning process */ 377 /* start the cleaning process */
343 schedule_delayed_work(&cache_cleaner, 0); 378 schedule_delayed_work(&cache_cleaner, 0);
379 return 0;
344} 380}
381EXPORT_SYMBOL(cache_register);
345 382
346int cache_unregister(struct cache_detail *cd) 383void cache_unregister(struct cache_detail *cd)
347{ 384{
348 cache_purge(cd); 385 cache_purge(cd);
349 spin_lock(&cache_list_lock); 386 spin_lock(&cache_list_lock);
@@ -351,30 +388,23 @@ int cache_unregister(struct cache_detail *cd)
351 if (cd->entries || atomic_read(&cd->inuse)) { 388 if (cd->entries || atomic_read(&cd->inuse)) {
352 write_unlock(&cd->hash_lock); 389 write_unlock(&cd->hash_lock);
353 spin_unlock(&cache_list_lock); 390 spin_unlock(&cache_list_lock);
354 return -EBUSY; 391 goto out;
355 } 392 }
356 if (current_detail == cd) 393 if (current_detail == cd)
357 current_detail = NULL; 394 current_detail = NULL;
358 list_del_init(&cd->others); 395 list_del_init(&cd->others);
359 write_unlock(&cd->hash_lock); 396 write_unlock(&cd->hash_lock);
360 spin_unlock(&cache_list_lock); 397 spin_unlock(&cache_list_lock);
361 if (cd->proc_ent) { 398 remove_cache_proc_entries(cd);
362 if (cd->flush_ent)
363 remove_proc_entry("flush", cd->proc_ent);
364 if (cd->channel_ent)
365 remove_proc_entry("channel", cd->proc_ent);
366 if (cd->content_ent)
367 remove_proc_entry("content", cd->proc_ent);
368
369 cd->proc_ent = NULL;
370 remove_proc_entry(cd->name, proc_net_rpc);
371 }
372 if (list_empty(&cache_list)) { 399 if (list_empty(&cache_list)) {
373 /* module must be being unloaded so it's safe to kill the worker */ 400 /* module must be being unloaded so it's safe to kill the worker */
374 cancel_delayed_work_sync(&cache_cleaner); 401 cancel_delayed_work_sync(&cache_cleaner);
375 } 402 }
376 return 0; 403 return;
404out:
405 printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
377} 406}
407EXPORT_SYMBOL(cache_unregister);
378 408
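With cache_register() now returning an error and cache_unregister() returning void, callers invert their error handling. A minimal sketch of the updated caller pattern, assuming a hypothetical my_cache detail (hash table and cache_request/cache_parse hooks elided):

/* Hedged sketch: 'my_cache' is a hypothetical cache_detail instance. */
static struct cache_detail my_cache = {
	.name = "my_cache",
};

static int __init my_init(void)
{
	/* can now fail, e.g. -ENOMEM from create_cache_proc_entries() */
	return cache_register(&my_cache);
}

static void __exit my_exit(void)
{
	/* no longer returns a status; failure is logged internally */
	cache_unregister(&my_cache);
}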
379/* clean cache tries to find something to clean 409/* clean cache tries to find something to clean
380 * and cleans it. 410 * and cleans it.
@@ -489,6 +519,7 @@ void cache_flush(void)
489 while (cache_clean() != -1) 519 while (cache_clean() != -1)
490 cond_resched(); 520 cond_resched();
491} 521}
522EXPORT_SYMBOL(cache_flush);
492 523
493void cache_purge(struct cache_detail *detail) 524void cache_purge(struct cache_detail *detail)
494{ 525{
@@ -497,7 +528,7 @@ void cache_purge(struct cache_detail *detail)
497 cache_flush(); 528 cache_flush();
498 detail->flush_time = 1; 529 detail->flush_time = 1;
499} 530}
500 531EXPORT_SYMBOL(cache_purge);
501 532
502 533
503/* 534/*
@@ -634,13 +665,13 @@ void cache_clean_deferred(void *owner)
634/* 665/*
635 * communicate with user-space 666 * communicate with user-space
636 * 667 *
637 * We have a magic /proc file - /proc/sunrpc/cache 668 * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
638 * On read, you get a full request, or block 669 * On read, you get a full request, or block.
639 * On write, an update request is processed 670 * On write, an update request is processed.
640 * Poll works if anything to read, and always allows write 671 * Poll works if anything to read, and always allows write.
641 * 672 *
642 * Implemented by linked list of requests. Each open file has 673 * Implemented by linked list of requests. Each open file has
643 * a ->private that also exists in this list. New request are added 674 * a ->private that also exists in this list. New requests are added
644 * to the end and may wakeup and preceding readers. 675 * to the end and may wakeup and preceding readers.
645 * New readers are added to the head. If, on read, an item is found with 676 * New readers are added to the head. If, on read, an item is found with
646 * CACHE_UPCALLING clear, we free it from the list. 677 * CACHE_UPCALLING clear, we free it from the list.
@@ -963,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str)
963 *bpp = bp; 994 *bpp = bp;
964 *lp = len; 995 *lp = len;
965} 996}
997EXPORT_SYMBOL(qword_add);
966 998
967void qword_addhex(char **bpp, int *lp, char *buf, int blen) 999void qword_addhex(char **bpp, int *lp, char *buf, int blen)
968{ 1000{
@@ -991,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
991 *bpp = bp; 1023 *bpp = bp;
992 *lp = len; 1024 *lp = len;
993} 1025}
1026EXPORT_SYMBOL(qword_addhex);
994 1027
995static void warn_no_listener(struct cache_detail *detail) 1028static void warn_no_listener(struct cache_detail *detail)
996{ 1029{
@@ -1113,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
1113 *dest = '\0'; 1146 *dest = '\0';
1114 return len; 1147 return len;
1115} 1148}
1149EXPORT_SYMBOL(qword_get);
1116 1150
1117 1151
1118/* 1152/*
@@ -1244,18 +1278,18 @@ static ssize_t read_flush(struct file *file, char __user *buf,
1244 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; 1278 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data;
1245 char tbuf[20]; 1279 char tbuf[20];
1246 unsigned long p = *ppos; 1280 unsigned long p = *ppos;
1247 int len; 1281 size_t len;
1248 1282
1249 sprintf(tbuf, "%lu\n", cd->flush_time); 1283 sprintf(tbuf, "%lu\n", cd->flush_time);
1250 len = strlen(tbuf); 1284 len = strlen(tbuf);
1251 if (p >= len) 1285 if (p >= len)
1252 return 0; 1286 return 0;
1253 len -= p; 1287 len -= p;
1254 if (len > count) len = count; 1288 if (len > count)
1289 len = count;
1255 if (copy_to_user(buf, (void*)(tbuf+p), len)) 1290 if (copy_to_user(buf, (void*)(tbuf+p), len))
1256 len = -EFAULT; 1291 return -EFAULT;
1257 else 1292 *ppos += len;
1258 *ppos += len;
1259 return len; 1293 return len;
1260} 1294}
1261 1295
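A side note on why the error path changed shape along with the int-to-size_t switch above: a negative errno stored in an unsigned size_t becomes a huge positive count, so the function must return -EFAULT directly rather than smuggling it through len. A tiny userspace illustration of the pitfall (not kernel code):

#include <stdio.h>

int main(void)
{
	size_t len = -14;	/* -EFAULT stuffed into an unsigned type */

	/* prints 18446744073709551602 on a 64-bit box, not -14 */
	printf("%zu\n", len);
	return 0;
}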
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 74df2d358e61..5a16875f5ac8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -33,7 +33,7 @@ struct proc_dir_entry *proc_net_rpc = NULL;
33static int rpc_proc_show(struct seq_file *seq, void *v) { 33static int rpc_proc_show(struct seq_file *seq, void *v) {
34 const struct rpc_stat *statp = seq->private; 34 const struct rpc_stat *statp = seq->private;
35 const struct rpc_program *prog = statp->program; 35 const struct rpc_program *prog = statp->program;
36 int i, j; 36 unsigned int i, j;
37 37
38 seq_printf(seq, 38 seq_printf(seq,
39 "net %u %u %u %u\n", 39 "net %u %u %u %u\n",
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
81 const struct svc_program *prog = statp->program; 81 const struct svc_program *prog = statp->program;
82 const struct svc_procedure *proc; 82 const struct svc_procedure *proc;
83 const struct svc_version *vers; 83 const struct svc_version *vers;
84 int i, j; 84 unsigned int i, j;
85 85
86 seq_printf(seq, 86 seq_printf(seq,
87 "net %u %u %u %u\n", 87 "net %u %u %u %u\n",
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
106 seq_putc(seq, '\n'); 106 seq_putc(seq, '\n');
107 } 107 }
108} 108}
109EXPORT_SYMBOL(svc_seq_show);
109 110
110/** 111/**
111 * rpc_alloc_iostats - allocate an rpc_iostats structure 112 * rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
255{ 256{
256 return do_register(statp->program->pg_name, statp, fops); 257 return do_register(statp->program->pg_name, statp, fops);
257} 258}
259EXPORT_SYMBOL(svc_proc_register);
258 260
259void 261void
260svc_proc_unregister(const char *name) 262svc_proc_unregister(const char *name)
261{ 263{
262 remove_proc_entry(name, proc_net_rpc); 264 remove_proc_entry(name, proc_net_rpc);
263} 265}
266EXPORT_SYMBOL(svc_proc_unregister);
264 267
265void 268void
266rpc_proc_init(void) 269rpc_proc_init(void)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1a7e309d008b..843629f55763 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,48 +22,6 @@
22#include <linux/sunrpc/rpc_pipe_fs.h> 22#include <linux/sunrpc/rpc_pipe_fs.h>
23#include <linux/sunrpc/xprtsock.h> 23#include <linux/sunrpc/xprtsock.h>
24 24
25/* RPC server stuff */
26EXPORT_SYMBOL(svc_create);
27EXPORT_SYMBOL(svc_create_thread);
28EXPORT_SYMBOL(svc_create_pooled);
29EXPORT_SYMBOL(svc_set_num_threads);
30EXPORT_SYMBOL(svc_exit_thread);
31EXPORT_SYMBOL(svc_destroy);
32EXPORT_SYMBOL(svc_drop);
33EXPORT_SYMBOL(svc_process);
34EXPORT_SYMBOL(svc_recv);
35EXPORT_SYMBOL(svc_wake_up);
36EXPORT_SYMBOL(svc_makesock);
37EXPORT_SYMBOL(svc_reserve);
38EXPORT_SYMBOL(svc_auth_register);
39EXPORT_SYMBOL(auth_domain_lookup);
40EXPORT_SYMBOL(svc_authenticate);
41EXPORT_SYMBOL(svc_set_client);
42
43/* RPC statistics */
44#ifdef CONFIG_PROC_FS
45EXPORT_SYMBOL(svc_proc_register);
46EXPORT_SYMBOL(svc_proc_unregister);
47EXPORT_SYMBOL(svc_seq_show);
48#endif
49
50/* caching... */
51EXPORT_SYMBOL(auth_domain_find);
52EXPORT_SYMBOL(auth_domain_put);
53EXPORT_SYMBOL(auth_unix_add_addr);
54EXPORT_SYMBOL(auth_unix_forget_old);
55EXPORT_SYMBOL(auth_unix_lookup);
56EXPORT_SYMBOL(cache_check);
57EXPORT_SYMBOL(cache_flush);
58EXPORT_SYMBOL(cache_purge);
59EXPORT_SYMBOL(cache_register);
60EXPORT_SYMBOL(cache_unregister);
61EXPORT_SYMBOL(qword_add);
62EXPORT_SYMBOL(qword_addhex);
63EXPORT_SYMBOL(qword_get);
64EXPORT_SYMBOL(svcauth_unix_purge);
65EXPORT_SYMBOL(unix_domain_find);
66
67extern struct cache_detail ip_map_cache, unix_gid_cache; 25extern struct cache_detail ip_map_cache, unix_gid_cache;
68 26
69static int __init 27static int __init
@@ -85,7 +43,8 @@ init_sunrpc(void)
85#endif 43#endif
86 cache_register(&ip_map_cache); 44 cache_register(&ip_map_cache);
87 cache_register(&unix_gid_cache); 45 cache_register(&unix_gid_cache);
88 init_socket_xprt(); 46 svc_init_xprt_sock(); /* svc sock transport */
47 init_socket_xprt(); /* clnt sock transport */
89 rpcauth_init_module(); 48 rpcauth_init_module();
90out: 49out:
91 return err; 50 return err;
@@ -96,12 +55,11 @@ cleanup_sunrpc(void)
96{ 55{
97 rpcauth_remove_module(); 56 rpcauth_remove_module();
98 cleanup_socket_xprt(); 57 cleanup_socket_xprt();
58 svc_cleanup_xprt_sock();
99 unregister_rpc_pipefs(); 59 unregister_rpc_pipefs();
100 rpc_destroy_mempool(); 60 rpc_destroy_mempool();
101 if (cache_unregister(&ip_map_cache)) 61 cache_unregister(&ip_map_cache);
102 printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); 62 cache_unregister(&unix_gid_cache);
103 if (cache_unregister(&unix_gid_cache))
104 printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
105#ifdef RPC_DEBUG 63#ifdef RPC_DEBUG
106 rpc_unregister_sysctl(); 64 rpc_unregister_sysctl();
107#endif 65#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4ad5fbbb18b4..a290e1523297 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
364 void (*shutdown)(struct svc_serv *serv)) 364 void (*shutdown)(struct svc_serv *serv))
365{ 365{
366 struct svc_serv *serv; 366 struct svc_serv *serv;
367 int vers; 367 unsigned int vers;
368 unsigned int xdrsize; 368 unsigned int xdrsize;
369 unsigned int i; 369 unsigned int i;
370 370
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
433{ 433{
434 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 434 return __svc_create(prog, bufsize, /*npools*/1, shutdown);
435} 435}
436EXPORT_SYMBOL(svc_create);
436 437
437struct svc_serv * 438struct svc_serv *
438svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 439svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
452 453
453 return serv; 454 return serv;
454} 455}
456EXPORT_SYMBOL(svc_create_pooled);
455 457
456/* 458/*
457 * Destroy an RPC service. Should be called with the BKL held 459 * Destroy an RPC service. Should be called with the BKL held
@@ -459,9 +461,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
459void 461void
460svc_destroy(struct svc_serv *serv) 462svc_destroy(struct svc_serv *serv)
461{ 463{
462 struct svc_sock *svsk;
463 struct svc_sock *tmp;
464
465 dprintk("svc: svc_destroy(%s, %d)\n", 464 dprintk("svc: svc_destroy(%s, %d)\n",
466 serv->sv_program->pg_name, 465 serv->sv_program->pg_name,
467 serv->sv_nrthreads); 466 serv->sv_nrthreads);
@@ -476,14 +475,12 @@ svc_destroy(struct svc_serv *serv)
476 475
477 del_timer_sync(&serv->sv_temptimer); 476 del_timer_sync(&serv->sv_temptimer);
478 477
479 list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) 478 svc_close_all(&serv->sv_tempsocks);
480 svc_force_close_socket(svsk);
481 479
482 if (serv->sv_shutdown) 480 if (serv->sv_shutdown)
483 serv->sv_shutdown(serv); 481 serv->sv_shutdown(serv);
484 482
485 list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) 483 svc_close_all(&serv->sv_permsocks);
486 svc_force_close_socket(svsk);
487 484
488 BUG_ON(!list_empty(&serv->sv_permsocks)); 485 BUG_ON(!list_empty(&serv->sv_permsocks));
489 BUG_ON(!list_empty(&serv->sv_tempsocks)); 486 BUG_ON(!list_empty(&serv->sv_tempsocks));
@@ -498,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
498 kfree(serv->sv_pools); 495 kfree(serv->sv_pools);
499 kfree(serv); 496 kfree(serv);
500} 497}
498EXPORT_SYMBOL(svc_destroy);
501 499
502/* 500/*
503 * Allocate an RPC server's buffer space. 501 * Allocate an RPC server's buffer space.
@@ -536,31 +534,17 @@ svc_release_buffer(struct svc_rqst *rqstp)
536 put_page(rqstp->rq_pages[i]); 534 put_page(rqstp->rq_pages[i]);
537} 535}
538 536
539/*
540 * Create a thread in the given pool. Caller must hold BKL.
541 * On a NUMA or SMP machine, with a multi-pool serv, the thread
542 * will be restricted to run on the cpus belonging to the pool.
543 */
544static int
545__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
546 struct svc_pool *pool)
537struct svc_rqst *
538svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
547{ 539{
548 struct svc_rqst *rqstp; 540 struct svc_rqst *rqstp;
549 int error = -ENOMEM;
550 int have_oldmask = 0;
551 cpumask_t oldmask;
552 541
553 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 542 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
554 if (!rqstp) 543 if (!rqstp)
555 goto out; 544 goto out_enomem;
556 545
557 init_waitqueue_head(&rqstp->rq_wait); 546 init_waitqueue_head(&rqstp->rq_wait);
558 547
559 if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
560 || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
561 || !svc_init_buffer(rqstp, serv->sv_max_mesg))
562 goto out_thread;
563
564 serv->sv_nrthreads++; 548 serv->sv_nrthreads++;
565 spin_lock_bh(&pool->sp_lock); 549 spin_lock_bh(&pool->sp_lock);
566 pool->sp_nrthreads++; 550 pool->sp_nrthreads++;
@@ -569,6 +553,45 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
569 rqstp->rq_server = serv; 553 rqstp->rq_server = serv;
570 rqstp->rq_pool = pool; 554 rqstp->rq_pool = pool;
571 555
556 rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
557 if (!rqstp->rq_argp)
558 goto out_thread;
559
560 rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
561 if (!rqstp->rq_resp)
562 goto out_thread;
563
564 if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
565 goto out_thread;
566
567 return rqstp;
568out_thread:
569 svc_exit_thread(rqstp);
570out_enomem:
571 return ERR_PTR(-ENOMEM);
572}
573EXPORT_SYMBOL(svc_prepare_thread);
574
575/*
576 * Create a thread in the given pool. Caller must hold BKL.
577 * On a NUMA or SMP machine, with a multi-pool serv, the thread
578 * will be restricted to run on the cpus belonging to the pool.
579 */
580static int
581__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
582 struct svc_pool *pool)
583{
584 struct svc_rqst *rqstp;
585 int error = -ENOMEM;
586 int have_oldmask = 0;
587 cpumask_t oldmask;
588
589 rqstp = svc_prepare_thread(serv, pool);
590 if (IS_ERR(rqstp)) {
591 error = PTR_ERR(rqstp);
592 goto out;
593 }
594
572 if (serv->sv_nrpools > 1) 595 if (serv->sv_nrpools > 1)
573 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); 596 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
574 597
@@ -597,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv)
597{ 620{
598 return __svc_create_thread(func, serv, &serv->sv_pools[0]); 621 return __svc_create_thread(func, serv, &serv->sv_pools[0]);
599} 622}
623EXPORT_SYMBOL(svc_create_thread);
600 624
601/* 625/*
602 * Choose a pool in which to create a new thread, for svc_set_num_threads 626 * Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -700,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
700 724
701 return error; 725 return error;
702} 726}
727EXPORT_SYMBOL(svc_set_num_threads);
703 728
704/* 729/*
705 * Called from a server thread as it's exiting. Caller must hold BKL. 730 * Called from a server thread as it's exiting. Caller must hold BKL.
@@ -726,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
726 if (serv) 751 if (serv)
727 svc_destroy(serv); 752 svc_destroy(serv);
728} 753}
754EXPORT_SYMBOL(svc_exit_thread);
729 755
730/* 756/*
731 * Register an RPC service with the local portmapper. 757 * Register an RPC service with the local portmapper.
@@ -737,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
737{ 763{
738 struct svc_program *progp; 764 struct svc_program *progp;
739 unsigned long flags; 765 unsigned long flags;
740 int i, error = 0, dummy; 766 unsigned int i;
767 int error = 0, dummy;
741 768
742 if (!port) 769 if (!port)
743 clear_thread_flag(TIF_SIGPENDING); 770 clear_thread_flag(TIF_SIGPENDING);
@@ -840,9 +867,9 @@ svc_process(struct svc_rqst *rqstp)
840 rqstp->rq_res.tail[0].iov_len = 0; 867 rqstp->rq_res.tail[0].iov_len = 0;
841 /* Will be turned off only in gss privacy case: */ 868 /* Will be turned off only in gss privacy case: */
842 rqstp->rq_splice_ok = 1; 869 rqstp->rq_splice_ok = 1;
843 /* tcp needs a space for the record length... */ 870
844 if (rqstp->rq_prot == IPPROTO_TCP) 871 /* Setup reply header */
845 svc_putnl(resv, 0); 872 rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
846 873
847 rqstp->rq_xid = svc_getu32(argv); 874 rqstp->rq_xid = svc_getu32(argv);
848 svc_putu32(resv, rqstp->rq_xid); 875 svc_putu32(resv, rqstp->rq_xid);
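Here the TCP-only special case becomes a transport callback: each class supplies an xpo_prep_reply_hdr. A hedged sketch of what the TCP flavor would do (function name hypothetical) is exactly the old svc_putnl() placeholder, while a datagram transport's callback can be a no-op:

static void my_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
{
	struct kvec *resv = &rqstp->rq_res.head[0];

	/* reserve 4 bytes for the TCP record marker, filled in at send */
	svc_putnl(resv, 0);
}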
@@ -1049,16 +1076,15 @@ err_bad:
1049 svc_putnl(resv, ntohl(rpc_stat)); 1076 svc_putnl(resv, ntohl(rpc_stat));
1050 goto sendit; 1077 goto sendit;
1051} 1078}
1079EXPORT_SYMBOL(svc_process);
1052 1080
1053/* 1081/*
1054 * Return (transport-specific) limit on the rpc payload. 1082 * Return (transport-specific) limit on the rpc payload.
1055 */ 1083 */
1056u32 svc_max_payload(const struct svc_rqst *rqstp) 1084u32 svc_max_payload(const struct svc_rqst *rqstp)
1057{ 1085{
1058 int max = RPCSVC_MAXPAYLOAD_TCP; 1086 u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
1059 1087
1060 if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM)
1061 max = RPCSVC_MAXPAYLOAD_UDP;
1062 if (rqstp->rq_server->sv_max_payload < max) 1088 if (rqstp->rq_server->sv_max_payload < max)
1063 max = rqstp->rq_server->sv_max_payload; 1089 max = rqstp->rq_server->sv_max_payload;
1064 return max; 1090 return max;
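The rewritten svc_max_payload() consults the transport class instead of sniffing the socket type. A hedged sketch of how the old UDP/TCP constants would migrate into per-class fields (initializers illustrative):

/* Sketch: each socket class advertises its own payload ceiling, so
 * svc_max_payload() needs no protocol checks. */
static struct svc_xprt_class svc_udp_class = {
	.xcl_name	 = "udp",
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
	/* .xcl_owner, .xcl_ops ... */
};

static struct svc_xprt_class svc_tcp_class = {
	.xcl_name	 = "tcp",
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
	/* .xcl_owner, .xcl_ops ... */
};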
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 000000000000..ea377e06afae
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1055 @@
1/*
2 * linux/net/sunrpc/svc_xprt.c
3 *
4 * Author: Tom Tucker <tom@opengridcomputing.com>
5 */
6
7#include <linux/sched.h>
8#include <linux/errno.h>
9#include <linux/fcntl.h>
10#include <linux/net.h>
11#include <linux/in.h>
12#include <linux/inet.h>
13#include <linux/udp.h>
14#include <linux/tcp.h>
15#include <linux/unistd.h>
16#include <linux/slab.h>
17#include <linux/netdevice.h>
18#include <linux/skbuff.h>
19#include <linux/file.h>
20#include <linux/freezer.h>
21#include <net/sock.h>
22#include <net/checksum.h>
23#include <net/ip.h>
24#include <net/ipv6.h>
25#include <net/tcp_states.h>
26#include <linux/uaccess.h>
27#include <asm/ioctls.h>
28
29#include <linux/sunrpc/types.h>
30#include <linux/sunrpc/clnt.h>
31#include <linux/sunrpc/xdr.h>
32#include <linux/sunrpc/stats.h>
33#include <linux/sunrpc/svc_xprt.h>
34
35#define RPCDBG_FACILITY RPCDBG_SVCXPRT
36
37static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
38static int svc_deferred_recv(struct svc_rqst *rqstp);
39static struct cache_deferred_req *svc_defer(struct cache_req *req);
40static void svc_age_temp_xprts(unsigned long closure);
41
42/* apparently the "standard" is that clients close
43 * idle connections after 5 minutes, servers after
44 * 6 minutes
45 * http://www.connectathon.org/talks96/nfstcp.pdf
46 */
47static int svc_conn_age_period = 6*60;
48
49/* List of registered transport classes */
50static DEFINE_SPINLOCK(svc_xprt_class_lock);
51static LIST_HEAD(svc_xprt_class_list);
52
53/* SMP locking strategy:
54 *
55 * svc_pool->sp_lock protects most of the fields of that pool.
56 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
57 * when both need to be taken (rare), svc_serv->sv_lock is first.
58 * BKL protects svc_serv->sv_nrthread.
59 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
60 * and the ->sk_info_authunix cache.
61 *
62 * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
63 * enqueued multiply. During normal transport processing this bit
64 * is set by svc_xprt_enqueue and cleared by svc_xprt_received.
65 * Providers should not manipulate this bit directly.
66 *
67 * Some flags can be set to certain values at any time
68 * providing that certain rules are followed:
69 *
70 * XPT_CONN, XPT_DATA:
71 * - Can be set or cleared at any time.
72 * - After a set, svc_xprt_enqueue must be called to enqueue
73 * the transport for processing.
74 * - After a clear, the transport must be read/accepted.
75 * If this succeeds, it must be set again.
76 * XPT_CLOSE:
77 * - Can be set at any time. It is never cleared.
78 * XPT_DEAD:
79 * - Can only be set while XPT_BUSY is held which ensures
80 * that no other thread will be using the transport or will
81 * try to set XPT_DEAD.
82 */
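The XPT_CONN/XPT_DATA rule above is easiest to see from the producer side; a hedged sketch of a provider's data-ready callback (name and sk_user_data linkage assumed, modeled on the socket transport):

static void my_data_ready(struct sock *sk, int count)
{
	struct svc_xprt *xprt = sk->sk_user_data;	/* assumed linkage */

	if (xprt) {
		/* set the flag, then ask for the transport to be queued */
		set_bit(XPT_DATA, &xprt->xpt_flags);
		svc_xprt_enqueue(xprt);
	}
}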
83
84int svc_reg_xprt_class(struct svc_xprt_class *xcl)
85{
86 struct svc_xprt_class *cl;
87 int res = -EEXIST;
88
89 dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
90
91 INIT_LIST_HEAD(&xcl->xcl_list);
92 spin_lock(&svc_xprt_class_lock);
93 /* Make sure there isn't already a class with the same name */
94 list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) {
95 if (strcmp(xcl->xcl_name, cl->xcl_name) == 0)
96 goto out;
97 }
98 list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
99 res = 0;
100out:
101 spin_unlock(&svc_xprt_class_lock);
102 return res;
103}
104EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
105
106void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
107{
108 dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
109 spin_lock(&svc_xprt_class_lock);
110 list_del_init(&xcl->xcl_list);
111 spin_unlock(&svc_xprt_class_lock);
112}
113EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
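Registration is a module-lifetime affair for a provider. A minimal, hedged sketch (all names hypothetical; the ops table is elided):

static struct svc_xprt_ops my_xprt_ops;	/* xpo_create, xpo_accept, ... */

static struct svc_xprt_class my_xprt_class = {
	.xcl_name	 = "my-transport",
	.xcl_owner	 = THIS_MODULE,
	.xcl_ops	 = &my_xprt_ops,
	.xcl_max_payload = 32768,		/* illustrative limit */
};

static int __init my_xprt_init(void)
{
	/* -EEXIST if another class already claimed the name */
	return svc_reg_xprt_class(&my_xprt_class);
}

static void __exit my_xprt_exit(void)
{
	svc_unreg_xprt_class(&my_xprt_class);
}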
114
115/*
116 * Format the transport list for printing
117 */
118int svc_print_xprts(char *buf, int maxlen)
119{
120 struct list_head *le;
121 char tmpstr[80];
122 int len = 0;
123 buf[0] = '\0';
124
125 spin_lock(&svc_xprt_class_lock);
126 list_for_each(le, &svc_xprt_class_list) {
127 int slen;
128 struct svc_xprt_class *xcl =
129 list_entry(le, struct svc_xprt_class, xcl_list);
130
131 sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
132 slen = strlen(tmpstr);
133 if (len + slen > maxlen)
134 break;
135 len += slen;
136 strcat(buf, tmpstr);
137 }
138 spin_unlock(&svc_xprt_class_lock);
139
140 return len;
141}
142
143static void svc_xprt_free(struct kref *kref)
144{
145 struct svc_xprt *xprt =
146 container_of(kref, struct svc_xprt, xpt_ref);
147 struct module *owner = xprt->xpt_class->xcl_owner;
148 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
149 && xprt->xpt_auth_cache != NULL)
150 svcauth_unix_info_release(xprt->xpt_auth_cache);
151 xprt->xpt_ops->xpo_free(xprt);
152 module_put(owner);
153}
154
155void svc_xprt_put(struct svc_xprt *xprt)
156{
157 kref_put(&xprt->xpt_ref, svc_xprt_free);
158}
159EXPORT_SYMBOL_GPL(svc_xprt_put);
160
161/*
162 * Called by transport drivers to initialize the transport independent
163 * portion of the transport instance.
164 */
165void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
166 struct svc_serv *serv)
167{
168 memset(xprt, 0, sizeof(*xprt));
169 xprt->xpt_class = xcl;
170 xprt->xpt_ops = xcl->xcl_ops;
171 kref_init(&xprt->xpt_ref);
172 xprt->xpt_server = serv;
173 INIT_LIST_HEAD(&xprt->xpt_list);
174 INIT_LIST_HEAD(&xprt->xpt_ready);
175 INIT_LIST_HEAD(&xprt->xpt_deferred);
176 mutex_init(&xprt->xpt_mutex);
177 spin_lock_init(&xprt->xpt_lock);
178 set_bit(XPT_BUSY, &xprt->xpt_flags);
179}
180EXPORT_SYMBOL_GPL(svc_xprt_init);
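A provider is expected to embed the generic svc_xprt at the head of its per-connection structure and initialize it from xpo_create; since svc_xprt_init() leaves XPT_BUSY set, the creator must later clear it or call svc_xprt_received(). A hedged sketch:

struct my_svc_xprt {
	struct svc_xprt xprt;	/* generic part, must be initialized first */
	/* provider-private state: sockets, queues, ... */
};

static struct svc_xprt *my_xpo_create(struct svc_serv *serv,
				      struct sockaddr *sa, int salen,
				      int flags)
{
	struct my_svc_xprt *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	svc_xprt_init(&my_xprt_class, &p->xprt, serv);
	/* ... bind/listen on 'sa' using salen/flags ... */
	return &p->xprt;
}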
181
182int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
183 int flags)
184{
185 struct svc_xprt_class *xcl;
186 struct sockaddr_in sin = {
187 .sin_family = AF_INET,
188 .sin_addr.s_addr = INADDR_ANY,
189 .sin_port = htons(port),
190 };
191 dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
192 spin_lock(&svc_xprt_class_lock);
193 list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
194 struct svc_xprt *newxprt;
195
196 if (strcmp(xprt_name, xcl->xcl_name))
197 continue;
198
199 if (!try_module_get(xcl->xcl_owner))
200 goto err;
201
202 spin_unlock(&svc_xprt_class_lock);
203 newxprt = xcl->xcl_ops->
204 xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
205 flags);
206 if (IS_ERR(newxprt)) {
207 module_put(xcl->xcl_owner);
208 return PTR_ERR(newxprt);
209 }
210
211 clear_bit(XPT_TEMP, &newxprt->xpt_flags);
212 spin_lock_bh(&serv->sv_lock);
213 list_add(&newxprt->xpt_list, &serv->sv_permsocks);
214 spin_unlock_bh(&serv->sv_lock);
215 clear_bit(XPT_BUSY, &newxprt->xpt_flags);
216 return svc_xprt_local_port(newxprt);
217 }
218 err:
219 spin_unlock(&svc_xprt_class_lock);
220 dprintk("svc: transport %s not found\n", xprt_name);
221 return -ENOENT;
222}
223EXPORT_SYMBOL_GPL(svc_create_xprt);
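From the service side, svc_create_xprt() replaces svc_makesock()-style calls by naming the class. A hedged usage sketch (SVC_SOCK_DEFAULTS assumed from svcsock.h):

/* create an IPv4 listener on the NFS port via the "tcp" class;
 * returns the bound local port on success, -ENOENT if no such class */
int port = svc_create_xprt(serv, "tcp", 2049, SVC_SOCK_DEFAULTS);

if (port < 0)
	printk(KERN_WARNING "svc: transport setup failed: %d\n", port);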
224
225/*
226 * Copy the local and remote xprt addresses to the rqstp structure
227 */
228void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
229{
230 struct sockaddr *sin;
231
232 memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
233 rqstp->rq_addrlen = xprt->xpt_remotelen;
234
235 /*
236 * Destination address in request is needed for binding the
237 * source address in RPC replies/callbacks later.
238 */
239 sin = (struct sockaddr *)&xprt->xpt_local;
240 switch (sin->sa_family) {
241 case AF_INET:
242 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
243 break;
244 case AF_INET6:
245 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
246 break;
247 }
248}
249EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
250
251/**
252 * svc_print_addr - Format rq_addr field for printing
253 * @rqstp: svc_rqst struct containing address to print
254 * @buf: target buffer for formatted address
255 * @len: length of target buffer
256 *
257 */
258char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
259{
260 return __svc_print_addr(svc_addr(rqstp), buf, len);
261}
262EXPORT_SYMBOL_GPL(svc_print_addr);
263
264/*
265 * Queue up an idle server thread. Must have pool->sp_lock held.
266 * Note: this is really a stack rather than a queue, so that we only
267 * use as many different threads as we need, and the rest don't pollute
268 * the cache.
269 */
270static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
271{
272 list_add(&rqstp->rq_list, &pool->sp_threads);
273}
274
275/*
276 * Dequeue an nfsd thread. Must have pool->sp_lock held.
277 */
278static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
279{
280 list_del(&rqstp->rq_list);
281}
282
283/*
284 * Queue up a transport with data pending. If there are idle nfsd
285 * processes, wake 'em up.
286 *
287 */
288void svc_xprt_enqueue(struct svc_xprt *xprt)
289{
290 struct svc_serv *serv = xprt->xpt_server;
291 struct svc_pool *pool;
292 struct svc_rqst *rqstp;
293 int cpu;
294
295 if (!(xprt->xpt_flags &
296 ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
297 return;
298 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
299 return;
300
301 cpu = get_cpu();
302 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
303 put_cpu();
304
305 spin_lock_bh(&pool->sp_lock);
306
307 if (!list_empty(&pool->sp_threads) &&
308 !list_empty(&pool->sp_sockets))
309 printk(KERN_ERR
310 "svc_xprt_enqueue: "
311 "threads and transports both waiting??\n");
312
313 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
314 /* Don't enqueue dead transports */
315 dprintk("svc: transport %p is dead, not enqueued\n", xprt);
316 goto out_unlock;
317 }
318
319 /* Mark transport as busy. It will remain in this state until
320 * the provider calls svc_xprt_received. We update XPT_BUSY
321 * atomically because it also guards against trying to enqueue
322 * the transport twice.
323 */
324 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
325 /* Don't enqueue transport while already enqueued */
326 dprintk("svc: transport %p busy, not enqueued\n", xprt);
327 goto out_unlock;
328 }
329 BUG_ON(xprt->xpt_pool != NULL);
330 xprt->xpt_pool = pool;
331
332 /* Handle pending connection */
333 if (test_bit(XPT_CONN, &xprt->xpt_flags))
334 goto process;
335
336 /* Handle close in-progress */
337 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
338 goto process;
339
340 /* Check if we have space to reply to a request */
341 if (!xprt->xpt_ops->xpo_has_wspace(xprt)) {
342 /* Don't enqueue while not enough space for reply */
343 dprintk("svc: no write space, transport %p not enqueued\n",
344 xprt);
345 xprt->xpt_pool = NULL;
346 clear_bit(XPT_BUSY, &xprt->xpt_flags);
347 goto out_unlock;
348 }
349
350 process:
351 if (!list_empty(&pool->sp_threads)) {
352 rqstp = list_entry(pool->sp_threads.next,
353 struct svc_rqst,
354 rq_list);
355 dprintk("svc: transport %p served by daemon %p\n",
356 xprt, rqstp);
357 svc_thread_dequeue(pool, rqstp);
358 if (rqstp->rq_xprt)
359 printk(KERN_ERR
360 "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
361 rqstp, rqstp->rq_xprt);
362 rqstp->rq_xprt = xprt;
363 svc_xprt_get(xprt);
364 rqstp->rq_reserved = serv->sv_max_mesg;
365 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
366 BUG_ON(xprt->xpt_pool != pool);
367 wake_up(&rqstp->rq_wait);
368 } else {
369 dprintk("svc: transport %p put into queue\n", xprt);
370 list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
371 BUG_ON(xprt->xpt_pool != pool);
372 }
373
374out_unlock:
375 spin_unlock_bh(&pool->sp_lock);
376}
377EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
378
379/*
380 * Dequeue the first transport. Must be called with the pool->sp_lock held.
381 */
382static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
383{
384 struct svc_xprt *xprt;
385
386 if (list_empty(&pool->sp_sockets))
387 return NULL;
388
389 xprt = list_entry(pool->sp_sockets.next,
390 struct svc_xprt, xpt_ready);
391 list_del_init(&xprt->xpt_ready);
392
393 dprintk("svc: transport %p dequeued, inuse=%d\n",
394 xprt, atomic_read(&xprt->xpt_ref.refcount));
395
396 return xprt;
397}
398
399/*
400 * svc_xprt_received conditionally queues the transport for processing
401 * by another thread. The caller must hold the XPT_BUSY bit and must
402 * not thereafter touch transport data.
403 *
404 * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
405 * insufficient) data.
406 */
407void svc_xprt_received(struct svc_xprt *xprt)
408{
409 BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
410 xprt->xpt_pool = NULL;
411 clear_bit(XPT_BUSY, &xprt->xpt_flags);
412 svc_xprt_enqueue(xprt);
413}
414EXPORT_SYMBOL_GPL(svc_xprt_received);
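On the consuming side, a provider's xpo_recvfrom typically clears XPT_DATA before reading, re-sets it if more input may be queued, and hands the transport back via svc_xprt_received(). A condensed, hedged sketch (helper name hypothetical):

static int my_xpo_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt = rqstp->rq_xprt;
	int len;

	clear_bit(XPT_DATA, &xprt->xpt_flags);
	len = my_read_request(xprt, rqstp);	/* hypothetical helper */
	if (len > 0)
		set_bit(XPT_DATA, &xprt->xpt_flags);
	/* drop XPT_BUSY so another thread can service this transport */
	svc_xprt_received(xprt);
	return len;
}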
415
416/**
417 * svc_reserve - change the space reserved for the reply to a request.
418 * @rqstp: The request in question
419 * @space: new max space to reserve
420 *
421 * Each request reserves some space on the output queue of the transport
422 * to make sure the reply fits. This function reduces that reserved
423 * space to be the amount of space used already, plus @space.
424 *
425 */
426void svc_reserve(struct svc_rqst *rqstp, int space)
427{
428 space += rqstp->rq_res.head[0].iov_len;
429
430 if (space < rqstp->rq_reserved) {
431 struct svc_xprt *xprt = rqstp->rq_xprt;
432 atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
433 rqstp->rq_reserved = space;
434
435 svc_xprt_enqueue(xprt);
436 }
437}
438EXPORT_SYMBOL(svc_reserve);
439
440static void svc_xprt_release(struct svc_rqst *rqstp)
441{
442 struct svc_xprt *xprt = rqstp->rq_xprt;
443
444 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
445
446 svc_free_res_pages(rqstp);
447 rqstp->rq_res.page_len = 0;
448 rqstp->rq_res.page_base = 0;
449
450 /* Reset response buffer and release
451 * the reservation.
452 * But first, check that enough space was reserved
453 * for the reply, otherwise we have a bug!
454 */
455 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
456 printk(KERN_ERR "RPC request reserved %d but used %d\n",
457 rqstp->rq_reserved,
458 rqstp->rq_res.len);
459
460 rqstp->rq_res.head[0].iov_len = 0;
461 svc_reserve(rqstp, 0);
462 rqstp->rq_xprt = NULL;
463
464 svc_xprt_put(xprt);
465}
466
467/*
468 * External function to wake up a server waiting for data
469 * This really only makes sense for services like lockd
470 * which have exactly one thread anyway.
471 */
472void svc_wake_up(struct svc_serv *serv)
473{
474 struct svc_rqst *rqstp;
475 unsigned int i;
476 struct svc_pool *pool;
477
478 for (i = 0; i < serv->sv_nrpools; i++) {
479 pool = &serv->sv_pools[i];
480
481 spin_lock_bh(&pool->sp_lock);
482 if (!list_empty(&pool->sp_threads)) {
483 rqstp = list_entry(pool->sp_threads.next,
484 struct svc_rqst,
485 rq_list);
486 dprintk("svc: daemon %p woken up.\n", rqstp);
487 /*
488 svc_thread_dequeue(pool, rqstp);
489 rqstp->rq_xprt = NULL;
490 */
491 wake_up(&rqstp->rq_wait);
492 }
493 spin_unlock_bh(&pool->sp_lock);
494 }
495}
496EXPORT_SYMBOL(svc_wake_up);
497
498int svc_port_is_privileged(struct sockaddr *sin)
499{
500 switch (sin->sa_family) {
501 case AF_INET:
502 return ntohs(((struct sockaddr_in *)sin)->sin_port)
503 < PROT_SOCK;
504 case AF_INET6:
505 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
506 < PROT_SOCK;
507 default:
508 return 0;
509 }
510}
511
512/*
513 * Make sure that we don't have too many active connections. If we
514 * have, something must be dropped.
515 *
516 * There's no point in trying to do random drop here for DoS
517 * prevention. NFS clients do one reconnect in 15 seconds. An
518 * attacker can easily beat that.
519 *
520 * The only somewhat efficient mechanism would be to drop old
521 * connections from the same IP first. But right now we don't even
522 * record the client IP in svc_sock.
523 */
524static void svc_check_conn_limits(struct svc_serv *serv)
525{
526 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
527 struct svc_xprt *xprt = NULL;
528 spin_lock_bh(&serv->sv_lock);
529 if (!list_empty(&serv->sv_tempsocks)) {
530 if (net_ratelimit()) {
531 /* Try to help the admin */
532 printk(KERN_NOTICE "%s: too many open "
533 "connections, consider increasing the "
534 "number of nfsd threads\n",
535 serv->sv_name);
536 }
537 /*
538 * Always select the oldest connection. It's not fair,
539 * but so is life
540 */
541 xprt = list_entry(serv->sv_tempsocks.prev,
542 struct svc_xprt,
543 xpt_list);
544 set_bit(XPT_CLOSE, &xprt->xpt_flags);
545 svc_xprt_get(xprt);
546 }
547 spin_unlock_bh(&serv->sv_lock);
548
549 if (xprt) {
550 svc_xprt_enqueue(xprt);
551 svc_xprt_put(xprt);
552 }
553 }
554}
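For scale: with the common default of eight nfsd threads, the threshold above works out to (8 + 3) * 20 = 220 temporary connections before the oldest one is marked XPT_CLOSE and queued for a thread to reap.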
555
556/*
557 * Receive the next request on any transport. This code is carefully
558 * organised not to touch any cachelines in the shared svc_serv
559 * structure, only cachelines in the local svc_pool.
560 */
561int svc_recv(struct svc_rqst *rqstp, long timeout)
562{
563 struct svc_xprt *xprt = NULL;
564 struct svc_serv *serv = rqstp->rq_server;
565 struct svc_pool *pool = rqstp->rq_pool;
566 int len, i;
567 int pages;
568 struct xdr_buf *arg;
569 DECLARE_WAITQUEUE(wait, current);
570
571 dprintk("svc: server %p waiting for data (to = %ld)\n",
572 rqstp, timeout);
573
574 if (rqstp->rq_xprt)
575 printk(KERN_ERR
576 "svc_recv: service %p, transport not NULL!\n",
577 rqstp);
578 if (waitqueue_active(&rqstp->rq_wait))
579 printk(KERN_ERR
580 "svc_recv: service %p, wait queue active!\n",
581 rqstp);
582
583 /* now allocate needed pages. If we get a failure, sleep briefly */
584 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
585 for (i = 0; i < pages ; i++)
586 while (rqstp->rq_pages[i] == NULL) {
587 struct page *p = alloc_page(GFP_KERNEL);
588 if (!p) {
589 int j = msecs_to_jiffies(500);
590 schedule_timeout_uninterruptible(j);
591 }
592 rqstp->rq_pages[i] = p;
593 }
594 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
595 BUG_ON(pages >= RPCSVC_MAXPAGES);
596
597 /* Make arg->head point to first page and arg->pages point to rest */
598 arg = &rqstp->rq_arg;
599 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
600 arg->head[0].iov_len = PAGE_SIZE;
601 arg->pages = rqstp->rq_pages + 1;
602 arg->page_base = 0;
603 /* save at least one page for response */
604 arg->page_len = (pages-2)*PAGE_SIZE;
605 arg->len = (pages-1)*PAGE_SIZE;
606 arg->tail[0].iov_len = 0;
607
608 try_to_freeze();
609 cond_resched();
610 if (signalled())
611 return -EINTR;
612
613 spin_lock_bh(&pool->sp_lock);
614 xprt = svc_xprt_dequeue(pool);
615 if (xprt) {
616 rqstp->rq_xprt = xprt;
617 svc_xprt_get(xprt);
618 rqstp->rq_reserved = serv->sv_max_mesg;
619 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
620 } else {
621 /* No data pending. Go to sleep */
622 svc_thread_enqueue(pool, rqstp);
623
624 /*
625 * We have to be able to interrupt this wait
626 * to bring down the daemons ...
627 */
628 set_current_state(TASK_INTERRUPTIBLE);
629 add_wait_queue(&rqstp->rq_wait, &wait);
630 spin_unlock_bh(&pool->sp_lock);
631
632 schedule_timeout(timeout);
633
634 try_to_freeze();
635
636 spin_lock_bh(&pool->sp_lock);
637 remove_wait_queue(&rqstp->rq_wait, &wait);
638
639 xprt = rqstp->rq_xprt;
640 if (!xprt) {
641 svc_thread_dequeue(pool, rqstp);
642 spin_unlock_bh(&pool->sp_lock);
643 dprintk("svc: server %p, no data yet\n", rqstp);
644 return signalled()? -EINTR : -EAGAIN;
645 }
646 }
647 spin_unlock_bh(&pool->sp_lock);
648
649 len = 0;
650 if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
651 dprintk("svc_recv: found XPT_CLOSE\n");
652 svc_delete_xprt(xprt);
653 } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
654 struct svc_xprt *newxpt;
655 newxpt = xprt->xpt_ops->xpo_accept(xprt);
656 if (newxpt) {
657 /*
658 * We know this module_get will succeed because the
659 * listener holds a reference too
660 */
661 __module_get(newxpt->xpt_class->xcl_owner);
662 svc_check_conn_limits(xprt->xpt_server);
663 spin_lock_bh(&serv->sv_lock);
664 set_bit(XPT_TEMP, &newxpt->xpt_flags);
665 list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
666 serv->sv_tmpcnt++;
667 if (serv->sv_temptimer.function == NULL) {
668 /* setup timer to age temp transports */
669 setup_timer(&serv->sv_temptimer,
670 svc_age_temp_xprts,
671 (unsigned long)serv);
672 mod_timer(&serv->sv_temptimer,
673 jiffies + svc_conn_age_period * HZ);
674 }
675 spin_unlock_bh(&serv->sv_lock);
676 svc_xprt_received(newxpt);
677 }
678 svc_xprt_received(xprt);
679 } else {
680 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
681 rqstp, pool->sp_id, xprt,
682 atomic_read(&xprt->xpt_ref.refcount));
683 rqstp->rq_deferred = svc_deferred_dequeue(xprt);
684 if (rqstp->rq_deferred) {
685 svc_xprt_received(xprt);
686 len = svc_deferred_recv(rqstp);
687 } else
688 len = xprt->xpt_ops->xpo_recvfrom(rqstp);
689 dprintk("svc: got len=%d\n", len);
690 }
691
692 /* No data, incomplete (TCP) read, or accept() */
693 if (len == 0 || len == -EAGAIN) {
694 rqstp->rq_res.len = 0;
695 svc_xprt_release(rqstp);
696 return -EAGAIN;
697 }
698 clear_bit(XPT_OLD, &xprt->xpt_flags);
699
700 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
701 rqstp->rq_chandle.defer = svc_defer;
702
703 if (serv->sv_stats)
704 serv->sv_stats->netcnt++;
705 return len;
706}
707EXPORT_SYMBOL(svc_recv);
708
709/*
710 * Drop request
711 */
712void svc_drop(struct svc_rqst *rqstp)
713{
714 dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
715 svc_xprt_release(rqstp);
716}
717EXPORT_SYMBOL(svc_drop);
718
719/*
720 * Return reply to client.
721 */
722int svc_send(struct svc_rqst *rqstp)
723{
724 struct svc_xprt *xprt;
725 int len;
726 struct xdr_buf *xb;
727
728 xprt = rqstp->rq_xprt;
729 if (!xprt)
730 return -EFAULT;
731
732 /* release the receive skb before sending the reply */
733 rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
734
735 /* calculate over-all length */
736 xb = &rqstp->rq_res;
737 xb->len = xb->head[0].iov_len +
738 xb->page_len +
739 xb->tail[0].iov_len;
740
741 /* Grab mutex to serialize outgoing data. */
742 mutex_lock(&xprt->xpt_mutex);
743 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
744 len = -ENOTCONN;
745 else
746 len = xprt->xpt_ops->xpo_sendto(rqstp);
747 mutex_unlock(&xprt->xpt_mutex);
748 svc_xprt_release(rqstp);
749
750 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
751 return 0;
752 return len;
753}
754
755/*
756 * Timer function to close old temporary transports, using
757 * a mark-and-sweep algorithm.
758 */
759static void svc_age_temp_xprts(unsigned long closure)
760{
761 struct svc_serv *serv = (struct svc_serv *)closure;
762 struct svc_xprt *xprt;
763 struct list_head *le, *next;
764 LIST_HEAD(to_be_aged);
765
766 dprintk("svc_age_temp_xprts\n");
767
768 if (!spin_trylock_bh(&serv->sv_lock)) {
769 /* busy, try again 1 sec later */
770 dprintk("svc_age_temp_xprts: busy\n");
771 mod_timer(&serv->sv_temptimer, jiffies + HZ);
772 return;
773 }
774
775 list_for_each_safe(le, next, &serv->sv_tempsocks) {
776 xprt = list_entry(le, struct svc_xprt, xpt_list);
777
778 /* First time through, just mark it OLD. Second time
779 * through, close it. */
780 if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
781 continue;
782 if (atomic_read(&xprt->xpt_ref.refcount) > 1
783 || test_bit(XPT_BUSY, &xprt->xpt_flags))
784 continue;
785 svc_xprt_get(xprt);
786 list_move(le, &to_be_aged);
787 set_bit(XPT_CLOSE, &xprt->xpt_flags);
788 set_bit(XPT_DETACHED, &xprt->xpt_flags);
789 }
790 spin_unlock_bh(&serv->sv_lock);
791
792 while (!list_empty(&to_be_aged)) {
793 le = to_be_aged.next;
794 /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
795 list_del_init(le);
796 xprt = list_entry(le, struct svc_xprt, xpt_list);
797
798 dprintk("queuing xprt %p for closing\n", xprt);
799
800 /* a thread will dequeue and close it soon */
801 svc_xprt_enqueue(xprt);
802 svc_xprt_put(xprt);
803 }
804
805 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
806}
807
808/*
809 * Remove a dead transport
810 */
811void svc_delete_xprt(struct svc_xprt *xprt)
812{
813 struct svc_serv *serv = xprt->xpt_server;
814
815 dprintk("svc: svc_delete_xprt(%p)\n", xprt);
816 xprt->xpt_ops->xpo_detach(xprt);
817
818 spin_lock_bh(&serv->sv_lock);
819 if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
820 list_del_init(&xprt->xpt_list);
821 /*
822 * We used to delete the transport from whichever list
823 * its sk_xprt.xpt_ready node was on, but we don't actually
824 * need to. This is because the only time we're called
825 * while still attached to a queue, the queue itself
826 * is about to be destroyed (in svc_destroy).
827 */
828 if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) {
829 BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2);
830 if (test_bit(XPT_TEMP, &xprt->xpt_flags))
831 serv->sv_tmpcnt--;
832 svc_xprt_put(xprt);
833 }
834 spin_unlock_bh(&serv->sv_lock);
835}
836
837void svc_close_xprt(struct svc_xprt *xprt)
838{
839 set_bit(XPT_CLOSE, &xprt->xpt_flags);
840 if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
841 /* someone else will have to effect the close */
842 return;
843
844 svc_xprt_get(xprt);
845 svc_delete_xprt(xprt);
846 clear_bit(XPT_BUSY, &xprt->xpt_flags);
847 svc_xprt_put(xprt);
848}
849EXPORT_SYMBOL_GPL(svc_close_xprt);
850
851void svc_close_all(struct list_head *xprt_list)
852{
853 struct svc_xprt *xprt;
854 struct svc_xprt *tmp;
855
856 list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
857 set_bit(XPT_CLOSE, &xprt->xpt_flags);
858 if (test_bit(XPT_BUSY, &xprt->xpt_flags)) {
859 /* Waiting to be processed, but no threads left,
860 * So just remove it from the waiting list
861 */
862 list_del_init(&xprt->xpt_ready);
863 clear_bit(XPT_BUSY, &xprt->xpt_flags);
864 }
865 svc_close_xprt(xprt);
866 }
867}
868
869/*
870 * Handle defer and revisit of requests
871 */
872
873static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
874{
875 struct svc_deferred_req *dr =
876 container_of(dreq, struct svc_deferred_req, handle);
877 struct svc_xprt *xprt = dr->xprt;
878
879 if (too_many) {
880 svc_xprt_put(xprt);
881 kfree(dr);
882 return;
883 }
884 dprintk("revisit queued\n");
885 dr->xprt = NULL;
886 spin_lock(&xprt->xpt_lock);
887 list_add(&dr->handle.recent, &xprt->xpt_deferred);
888 spin_unlock(&xprt->xpt_lock);
889 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
890 svc_xprt_enqueue(xprt);
891 svc_xprt_put(xprt);
892}
893
894/*
895 * Save the request off for later processing. The request buffer looks
896 * like this:
897 *
898 * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
899 *
900 * This code can only handle requests that consist of an xprt-header
901 * and rpc-header.
902 */
903static struct cache_deferred_req *svc_defer(struct cache_req *req)
904{
905 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
906 struct svc_deferred_req *dr;
907
908 if (rqstp->rq_arg.page_len)
909 return NULL; /* if more than a page, give up FIXME */
910 if (rqstp->rq_deferred) {
911 dr = rqstp->rq_deferred;
912 rqstp->rq_deferred = NULL;
913 } else {
914 size_t skip;
915 size_t size;
916 /* FIXME maybe discard if size too large */
917 size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
918 dr = kmalloc(size, GFP_KERNEL);
919 if (dr == NULL)
920 return NULL;
921
922 dr->handle.owner = rqstp->rq_server;
923 dr->prot = rqstp->rq_prot;
924 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
925 dr->addrlen = rqstp->rq_addrlen;
926 dr->daddr = rqstp->rq_daddr;
927 dr->argslen = rqstp->rq_arg.len >> 2;
928 dr->xprt_hlen = rqstp->rq_xprt_hlen;
929
930 /* back up head to the start of the buffer and copy */
931 skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
932 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
933 dr->argslen << 2);
934 }
935 svc_xprt_get(rqstp->rq_xprt);
936 dr->xprt = rqstp->rq_xprt;
937
938 dr->handle.revisit = svc_revisit;
939 return &dr->handle;
940}
941
942/*
943 * recv data from a deferred request into an active one
944 */
945static int svc_deferred_recv(struct svc_rqst *rqstp)
946{
947 struct svc_deferred_req *dr = rqstp->rq_deferred;
948
949 /* setup iov_base past transport header */
950 rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
951 /* The iov_len does not include the transport header bytes */
952 rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
953 rqstp->rq_arg.page_len = 0;
954 /* The rq_arg.len includes the transport header bytes */
955 rqstp->rq_arg.len = dr->argslen<<2;
956 rqstp->rq_prot = dr->prot;
957 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
958 rqstp->rq_addrlen = dr->addrlen;
959 /* Save off transport header len in case we get deferred again */
960 rqstp->rq_xprt_hlen = dr->xprt_hlen;
961 rqstp->rq_daddr = dr->daddr;
962 rqstp->rq_respages = rqstp->rq_pages;
963 return (dr->argslen<<2) - dr->xprt_hlen;
964}
965
966
967static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
968{
969 struct svc_deferred_req *dr = NULL;
970
971 if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
972 return NULL;
973 spin_lock(&xprt->xpt_lock);
974 clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
975 if (!list_empty(&xprt->xpt_deferred)) {
976 dr = list_entry(xprt->xpt_deferred.next,
977 struct svc_deferred_req,
978 handle.recent);
979 list_del_init(&dr->handle.recent);
980 set_bit(XPT_DEFERRED, &xprt->xpt_flags);
981 }
982 spin_unlock(&xprt->xpt_lock);
983 return dr;
984}
985
986/*
987 * Return the transport instance pointer for the endpoint accepting
988 * connections/peer traffic from the specified transport class,
989 * address family and port.
990 *
991 * Specifying 0 for the address family or port is effectively a
992 * wild-card, and will result in matching the first transport in the
993 * service's list that has a matching class name.
994 */
995struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
996 int af, int port)
997{
998 struct svc_xprt *xprt;
999 struct svc_xprt *found = NULL;
1000
1001 /* Sanity check the args */
1002 if (!serv || !xcl_name)
1003 return found;
1004
1005 spin_lock_bh(&serv->sv_lock);
1006 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1007 if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
1008 continue;
1009 if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
1010 continue;
1011 if (port && port != svc_xprt_local_port(xprt))
1012 continue;
1013 found = xprt;
1014 svc_xprt_get(xprt);
1015 break;
1016 }
1017 spin_unlock_bh(&serv->sv_lock);
1018 return found;
1019}
1020EXPORT_SYMBOL_GPL(svc_find_xprt);
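A typical lookup passes 0 as the wildcard the comment describes; note that svc_find_xprt() takes a reference the caller must drop. A hedged usage sketch ("tcp" assumed to be a registered class):

struct svc_xprt *xprt = svc_find_xprt(serv, "tcp", 0, 0);

if (xprt) {
	int port = svc_xprt_local_port(xprt);	/* bound port number */
	/* ... inspect the listener ... */
	svc_xprt_put(xprt);			/* drop the reference */
}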
1021
1022/*
1023 * Format a buffer with a list of the active transports. A zero for
1024 * the buflen parameter disables target buffer overflow checking.
1025 */
1026int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
1027{
1028 struct svc_xprt *xprt;
1029 char xprt_str[64];
1030 int totlen = 0;
1031 int len;
1032
1033 /* Sanity check args */
1034 if (!serv)
1035 return 0;
1036
1037 spin_lock_bh(&serv->sv_lock);
1038 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
1039 len = snprintf(xprt_str, sizeof(xprt_str),
1040 "%s %d\n", xprt->xpt_class->xcl_name,
1041 svc_xprt_local_port(xprt));
1042 /* If the string was truncated, replace with error string */
1043 if (len >= sizeof(xprt_str))
1044 strcpy(xprt_str, "name-too-long\n");
1045 /* Don't overflow buffer */
1046 len = strlen(xprt_str);
1047 if (buflen && (len + totlen >= buflen))
1048 break;
1049 strcpy(buf+totlen, xprt_str);
1050 totlen += len;
1051 }
1052 spin_unlock_bh(&serv->sv_lock);
1053 return totlen;
1054}
1055EXPORT_SYMBOL_GPL(svc_xprt_names);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index af7c5f05c6e1..8a73cbb16052 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
57 rqstp->rq_authop = aops; 57 rqstp->rq_authop = aops;
58 return aops->accept(rqstp, authp); 58 return aops->accept(rqstp, authp);
59} 59}
60EXPORT_SYMBOL(svc_authenticate);
60 61
61int svc_set_client(struct svc_rqst *rqstp) 62int svc_set_client(struct svc_rqst *rqstp)
62{ 63{
63 return rqstp->rq_authop->set_client(rqstp); 64 return rqstp->rq_authop->set_client(rqstp);
64} 65}
66EXPORT_SYMBOL(svc_set_client);
65 67
66/* A request, which was authenticated, has now executed. 68/* A request, which was authenticated, has now executed.
67 * Time to finalise the credentials and verifier 69 * Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
93 spin_unlock(&authtab_lock); 95 spin_unlock(&authtab_lock);
94 return rv; 96 return rv;
95} 97}
98EXPORT_SYMBOL(svc_auth_register);
96 99
97void 100void
98svc_auth_unregister(rpc_authflavor_t flavor) 101svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom)
129 spin_unlock(&auth_domain_lock); 132 spin_unlock(&auth_domain_lock);
130 } 133 }
131} 134}
135EXPORT_SYMBOL(auth_domain_put);
132 136
133struct auth_domain * 137struct auth_domain *
134auth_domain_lookup(char *name, struct auth_domain *new) 138auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new)
153 spin_unlock(&auth_domain_lock); 157 spin_unlock(&auth_domain_lock);
154 return new; 158 return new;
155} 159}
160EXPORT_SYMBOL(auth_domain_lookup);
156 161
157struct auth_domain *auth_domain_find(char *name) 162struct auth_domain *auth_domain_find(char *name)
158{ 163{
159 return auth_domain_lookup(name, NULL); 164 return auth_domain_lookup(name, NULL);
160} 165}
166EXPORT_SYMBOL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 411479411b21..3c64051e4555 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name)
63 rv = auth_domain_lookup(name, &new->h); 63 rv = auth_domain_lookup(name, &new->h);
64 } 64 }
65} 65}
66EXPORT_SYMBOL(unix_domain_find);
66 67
67static void svcauth_unix_domain_release(struct auth_domain *dom) 68static void svcauth_unix_domain_release(struct auth_domain *dom)
68{ 69{
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
340 else 341 else
341 return -ENOMEM; 342 return -ENOMEM;
342} 343}
344EXPORT_SYMBOL(auth_unix_add_addr);
343 345
344int auth_unix_forget_old(struct auth_domain *dom) 346int auth_unix_forget_old(struct auth_domain *dom)
345{ 347{
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
351 udom->addr_changes++; 353 udom->addr_changes++;
352 return 0; 354 return 0;
353} 355}
356EXPORT_SYMBOL(auth_unix_forget_old);
354 357
355struct auth_domain *auth_unix_lookup(struct in_addr addr) 358struct auth_domain *auth_unix_lookup(struct in_addr addr)
356{ 359{
@@ -375,50 +378,56 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
375 cache_put(&ipm->h, &ip_map_cache); 378 cache_put(&ipm->h, &ip_map_cache);
376 return rv; 379 return rv;
377} 380}
381EXPORT_SYMBOL(auth_unix_lookup);
378 382
379void svcauth_unix_purge(void) 383void svcauth_unix_purge(void)
380{ 384{
381 cache_purge(&ip_map_cache); 385 cache_purge(&ip_map_cache);
382} 386}
387EXPORT_SYMBOL(svcauth_unix_purge);
383 388
384static inline struct ip_map * 389static inline struct ip_map *
385ip_map_cached_get(struct svc_rqst *rqstp) 390ip_map_cached_get(struct svc_rqst *rqstp)
386{ 391{
387 struct ip_map *ipm;
388 struct svc_sock *svsk = rqstp->rq_sock;
389 spin_lock(&svsk->sk_lock);
390 ipm = svsk->sk_info_authunix;
391 if (ipm != NULL) {
392 if (!cache_valid(&ipm->h)) {
393 /*
394 * The entry has been invalidated since it was
395 * remembered, e.g. by a second mount from the
396 * same IP address.
397 */
398 svsk->sk_info_authunix = NULL;
399 spin_unlock(&svsk->sk_lock);
400 cache_put(&ipm->h, &ip_map_cache);
401 return NULL;
402 }
403 cache_get(&ipm->h);
404 }
405 spin_unlock(&svsk->sk_lock);
392 struct ip_map *ipm = NULL;
393 struct svc_xprt *xprt = rqstp->rq_xprt;
394
395 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
396 spin_lock(&xprt->xpt_lock);
397 ipm = xprt->xpt_auth_cache;
398 if (ipm != NULL) {
399 if (!cache_valid(&ipm->h)) {
400 /*
401 * The entry has been invalidated since it was
402 * remembered, e.g. by a second mount from the
403 * same IP address.
404 */
405 xprt->xpt_auth_cache = NULL;
406 spin_unlock(&xprt->xpt_lock);
407 cache_put(&ipm->h, &ip_map_cache);
408 return NULL;
409 }
410 cache_get(&ipm->h);
411 }
412 spin_unlock(&xprt->xpt_lock);
413 }
406 return ipm; 414 return ipm;
407} 415}
408 416
409static inline void 417static inline void
410ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 418ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
411{ 419{
412 struct svc_sock *svsk = rqstp->rq_sock;
413
414 spin_lock(&svsk->sk_lock);
415 if (svsk->sk_sock->type == SOCK_STREAM &&
416 svsk->sk_info_authunix == NULL) {
417 /* newly cached, keep the reference */
418 svsk->sk_info_authunix = ipm;
419 ipm = NULL;
420 struct svc_xprt *xprt = rqstp->rq_xprt;
421
422 if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
423 spin_lock(&xprt->xpt_lock);
424 if (xprt->xpt_auth_cache == NULL) {
425 /* newly cached, keep the reference */
426 xprt->xpt_auth_cache = ipm;
427 ipm = NULL;
428 }
429 spin_unlock(&xprt->xpt_lock);
420 } 430 }
421 spin_unlock(&svsk->sk_lock);
422 if (ipm) 431 if (ipm)
423 cache_put(&ipm->h, &ip_map_cache); 432 cache_put(&ipm->h, &ip_map_cache);
424} 433}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c75bffeb89eb..1d3e5fcc2cc4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * The server scheduling algorithm does not always distribute the load 6 * The server scheduling algorithm does not always distribute the load
7 * evenly when servicing a single client. May need to modify the 7 * evenly when servicing a single client. May need to modify the
8 * svc_sock_enqueue procedure... 8 * svc_xprt_enqueue procedure...
9 * 9 *
10 * TCP support is largely untested and may be a little slow. The problem 10 * TCP support is largely untested and may be a little slow. The problem
11 * is that we currently do two separate recvfrom's, one for the 4-byte 11 * is that we currently do two separate recvfrom's, one for the 4-byte
@@ -48,72 +48,40 @@
48#include <linux/sunrpc/svcsock.h> 48#include <linux/sunrpc/svcsock.h>
49#include <linux/sunrpc/stats.h> 49#include <linux/sunrpc/stats.h>
50 50
51/* SMP locking strategy:
52 *
53 * svc_pool->sp_lock protects most of the fields of that pool.
54 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
55 * when both need to be taken (rare), svc_serv->sv_lock is first.
56 * BKL protects svc_serv->sv_nrthread.
57 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
58 * and the ->sk_info_authunix cache.
59 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
60 *
61 * Some flags can be set to certain values at any time
62 * providing that certain rules are followed:
63 *
64 * SK_CONN, SK_DATA, can be set or cleared at any time.
65 * after a set, svc_sock_enqueue must be called.
66 * after a clear, the socket must be read/accepted
67 * if this succeeds, it must be set again.
68 * SK_CLOSE can set at any time. It is never cleared.
69 * sk_inuse contains a bias of '1' until SK_DEAD is set.
70 * so when sk_inuse hits zero, we know the socket is dead
71 * and no-one is using it.
72 * SK_DEAD can only be set while SK_BUSY is held which ensures
73 * no other thread will be using the socket or will try to
74 * set SK_DEAD.
75 *
76 */
77
78#define RPCDBG_FACILITY RPCDBG_SVCSOCK
79 52
80 53
81static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 54static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
82 int *errp, int flags); 55 int *errp, int flags);
83static void svc_delete_socket(struct svc_sock *svsk);
84static void svc_udp_data_ready(struct sock *, int); 56static void svc_udp_data_ready(struct sock *, int);
85static int svc_udp_recvfrom(struct svc_rqst *); 57static int svc_udp_recvfrom(struct svc_rqst *);
86static int svc_udp_sendto(struct svc_rqst *); 58static int svc_udp_sendto(struct svc_rqst *);
87static void svc_close_socket(struct svc_sock *svsk); 59static void svc_sock_detach(struct svc_xprt *);
88 60static void svc_sock_free(struct svc_xprt *);
89static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
90static int svc_deferred_recv(struct svc_rqst *rqstp);
91static struct cache_deferred_req *svc_defer(struct cache_req *req);
92
93/* apparently the "standard" is that clients close
94 * idle connections after 5 minutes, servers after
95 * 6 minutes
96 * http://www.connectathon.org/talks96/nfstcp.pdf
97 */
98static int svc_conn_age_period = 6*60;
99 61
62static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
63 struct sockaddr *, int, int);
100#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
101static struct lock_class_key svc_key[2]; 65static struct lock_class_key svc_key[2];
102static struct lock_class_key svc_slock_key[2]; 66static struct lock_class_key svc_slock_key[2];
103 67
104static inline void svc_reclassify_socket(struct socket *sock) 68static void svc_reclassify_socket(struct socket *sock)
105{ 69{
106 struct sock *sk = sock->sk; 70 struct sock *sk = sock->sk;
107 BUG_ON(sock_owned_by_user(sk)); 71 BUG_ON(sock_owned_by_user(sk));
108 switch (sk->sk_family) { 72 switch (sk->sk_family) {
109 case AF_INET: 73 case AF_INET:
110 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 74 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
111 &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); 75 &svc_slock_key[0],
76 "sk_xprt.xpt_lock-AF_INET-NFSD",
77 &svc_key[0]);
112 break; 78 break;
113 79
114 case AF_INET6: 80 case AF_INET6:
115 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 81 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
116 &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); 82 &svc_slock_key[1],
83 "sk_xprt.xpt_lock-AF_INET6-NFSD",
84 &svc_key[1]);
117 break; 85 break;
118 86
119 default: 87 default:
@@ -121,81 +89,26 @@ static inline void svc_reclassify_socket(struct socket *sock)
121 } 89 }
122} 90}
123#else 91#else
124static inline void svc_reclassify_socket(struct socket *sock) 92static void svc_reclassify_socket(struct socket *sock)
125{ 93{
126} 94}
127#endif 95#endif
128 96
129static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len)
130{
131 switch (addr->sa_family) {
132 case AF_INET:
133 snprintf(buf, len, "%u.%u.%u.%u, port=%u",
134 NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
135 ntohs(((struct sockaddr_in *) addr)->sin_port));
136 break;
137
138 case AF_INET6:
139 snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
140 NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
141 ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
142 break;
143
144 default:
145 snprintf(buf, len, "unknown address type: %d", addr->sa_family);
146 break;
147 }
148 return buf;
149}
150
151/**
152 * svc_print_addr - Format rq_addr field for printing
153 * @rqstp: svc_rqst struct containing address to print
154 * @buf: target buffer for formatted address
155 * @len: length of target buffer
156 *
157 */
158char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
159{
160 return __svc_print_addr(svc_addr(rqstp), buf, len);
161}
162EXPORT_SYMBOL_GPL(svc_print_addr);
163
164/*
165 * Queue up an idle server thread. Must have pool->sp_lock held.
166 * Note: this is really a stack rather than a queue, so that we only
167 * use as many different threads as we need, and the rest don't pollute
168 * the cache.
169 */
170static inline void
171svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
172{
173 list_add(&rqstp->rq_list, &pool->sp_threads);
174}
175
176/*
177 * Dequeue an nfsd thread. Must have pool->sp_lock held.
178 */
179static inline void
180svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
181{
182 list_del(&rqstp->rq_list);
183}
184
185/* 97/*
186 * Release an skbuff after use 98 * Release an skbuff after use
187 */ 99 */
188static inline void 100static void svc_release_skb(struct svc_rqst *rqstp)
189svc_release_skb(struct svc_rqst *rqstp)
190{ 101{
191 struct sk_buff *skb = rqstp->rq_skbuff; 102 struct sk_buff *skb = rqstp->rq_xprt_ctxt;
192 struct svc_deferred_req *dr = rqstp->rq_deferred; 103 struct svc_deferred_req *dr = rqstp->rq_deferred;
193 104
194 if (skb) { 105 if (skb) {
195 rqstp->rq_skbuff = NULL; 106 struct svc_sock *svsk =
107 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
108 rqstp->rq_xprt_ctxt = NULL;
196 109
197 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 110 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
198 skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 111 skb_free_datagram(svsk->sk_sk, skb);
199 } 112 }
200 if (dr) { 113 if (dr) {
201 rqstp->rq_deferred = NULL; 114 rqstp->rq_deferred = NULL;
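
svc_release_skb above shows the mechanical half of this patch: code that used rqstp->rq_sock now receives only the generic rqstp->rq_xprt and recovers the socket-private state with container_of(), relying on struct svc_sock embedding its struct svc_xprt as the sk_xprt member. A standalone demonstration of that embedding trick (types invented for the example):

    #include <stddef.h>
    #include <stdio.h>

    /* Recover a pointer to the enclosing struct from a member pointer. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct xprt { int flags; };          /* generic part */

    struct sock_xprt {                   /* socket-specific wrapper */
        int fd;
        struct xprt xprt;                /* embedded, like svc_sock.sk_xprt */
    };

    int main(void)
    {
        struct sock_xprt s = { .fd = 42 };
        struct xprt *generic = &s.xprt;  /* what generic code passes around */

        struct sock_xprt *back = container_of(generic, struct sock_xprt, xprt);
        printf("fd = %d\n", back->fd);   /* prints 42 */
        return 0;
    }

Because the member offset is computed at compile time, the generic layer pays nothing for this indirection.
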
@@ -203,253 +116,6 @@ svc_release_skb(struct svc_rqst *rqstp)
203 } 116 }
204} 117}
205 118
206/*
207 * Any space to write?
208 */
209static inline unsigned long
210svc_sock_wspace(struct svc_sock *svsk)
211{
212 int wspace;
213
214 if (svsk->sk_sock->type == SOCK_STREAM)
215 wspace = sk_stream_wspace(svsk->sk_sk);
216 else
217 wspace = sock_wspace(svsk->sk_sk);
218
219 return wspace;
220}
221
222/*
223 * Queue up a socket with data pending. If there are idle nfsd
224 * processes, wake 'em up.
225 *
226 */
227static void
228svc_sock_enqueue(struct svc_sock *svsk)
229{
230 struct svc_serv *serv = svsk->sk_server;
231 struct svc_pool *pool;
232 struct svc_rqst *rqstp;
233 int cpu;
234
235 if (!(svsk->sk_flags &
236 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
237 return;
238 if (test_bit(SK_DEAD, &svsk->sk_flags))
239 return;
240
241 cpu = get_cpu();
242 pool = svc_pool_for_cpu(svsk->sk_server, cpu);
243 put_cpu();
244
245 spin_lock_bh(&pool->sp_lock);
246
247 if (!list_empty(&pool->sp_threads) &&
248 !list_empty(&pool->sp_sockets))
249 printk(KERN_ERR
250 "svc_sock_enqueue: threads and sockets both waiting??\n");
251
252 if (test_bit(SK_DEAD, &svsk->sk_flags)) {
253 /* Don't enqueue dead sockets */
254 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
255 goto out_unlock;
256 }
257
258 /* Mark socket as busy. It will remain in this state until the
259 * server has processed all pending data and put the socket back
260 * on the idle list. We update SK_BUSY atomically because
261 * it also guards against trying to enqueue the svc_sock twice.
262 */
263 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
264 /* Don't enqueue socket while already enqueued */
265 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
266 goto out_unlock;
267 }
268 BUG_ON(svsk->sk_pool != NULL);
269 svsk->sk_pool = pool;
270
271 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
272 if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
273 > svc_sock_wspace(svsk))
274 && !test_bit(SK_CLOSE, &svsk->sk_flags)
275 && !test_bit(SK_CONN, &svsk->sk_flags)) {
276 /* Don't enqueue while not enough space for reply */
277 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
278 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
279 svc_sock_wspace(svsk));
280 svsk->sk_pool = NULL;
281 clear_bit(SK_BUSY, &svsk->sk_flags);
282 goto out_unlock;
283 }
284 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
285
286
287 if (!list_empty(&pool->sp_threads)) {
288 rqstp = list_entry(pool->sp_threads.next,
289 struct svc_rqst,
290 rq_list);
291 dprintk("svc: socket %p served by daemon %p\n",
292 svsk->sk_sk, rqstp);
293 svc_thread_dequeue(pool, rqstp);
294 if (rqstp->rq_sock)
295 printk(KERN_ERR
296 "svc_sock_enqueue: server %p, rq_sock=%p!\n",
297 rqstp, rqstp->rq_sock);
298 rqstp->rq_sock = svsk;
299 atomic_inc(&svsk->sk_inuse);
300 rqstp->rq_reserved = serv->sv_max_mesg;
301 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
302 BUG_ON(svsk->sk_pool != pool);
303 wake_up(&rqstp->rq_wait);
304 } else {
305 dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
306 list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
307 BUG_ON(svsk->sk_pool != pool);
308 }
309
310out_unlock:
311 spin_unlock_bh(&pool->sp_lock);
312}
313
314/*
315 * Dequeue the first socket. Must be called with the pool->sp_lock held.
316 */
317static inline struct svc_sock *
318svc_sock_dequeue(struct svc_pool *pool)
319{
320 struct svc_sock *svsk;
321
322 if (list_empty(&pool->sp_sockets))
323 return NULL;
324
325 svsk = list_entry(pool->sp_sockets.next,
326 struct svc_sock, sk_ready);
327 list_del_init(&svsk->sk_ready);
328
329 dprintk("svc: socket %p dequeued, inuse=%d\n",
330 svsk->sk_sk, atomic_read(&svsk->sk_inuse));
331
332 return svsk;
333}
334
335/*
336 * Having read something from a socket, check whether it
337 * needs to be re-enqueued.
338 * Note: SK_DATA only gets cleared when a read-attempt finds
339 * no (or insufficient) data.
340 */
341static inline void
342svc_sock_received(struct svc_sock *svsk)
343{
344 svsk->sk_pool = NULL;
345 clear_bit(SK_BUSY, &svsk->sk_flags);
346 svc_sock_enqueue(svsk);
347}
348
349
350/**
351 * svc_reserve - change the space reserved for the reply to a request.
352 * @rqstp: The request in question
353 * @space: new max space to reserve
354 *
355 * Each request reserves some space on the output queue of the socket
356 * to make sure the reply fits. This function reduces that reserved
357 * space to be the amount of space used already, plus @space.
358 *
359 */
360void svc_reserve(struct svc_rqst *rqstp, int space)
361{
362 space += rqstp->rq_res.head[0].iov_len;
363
364 if (space < rqstp->rq_reserved) {
365 struct svc_sock *svsk = rqstp->rq_sock;
366 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
367 rqstp->rq_reserved = space;
368
369 svc_sock_enqueue(svsk);
370 }
371}
372
373/*
374 * Release a socket after use.
375 */
376static inline void
377svc_sock_put(struct svc_sock *svsk)
378{
379 if (atomic_dec_and_test(&svsk->sk_inuse)) {
380 BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
381
382 dprintk("svc: releasing dead socket\n");
383 if (svsk->sk_sock->file)
384 sockfd_put(svsk->sk_sock);
385 else
386 sock_release(svsk->sk_sock);
387 if (svsk->sk_info_authunix != NULL)
388 svcauth_unix_info_release(svsk->sk_info_authunix);
389 kfree(svsk);
390 }
391}
392
393static void
394svc_sock_release(struct svc_rqst *rqstp)
395{
396 struct svc_sock *svsk = rqstp->rq_sock;
397
398 svc_release_skb(rqstp);
399
400 svc_free_res_pages(rqstp);
401 rqstp->rq_res.page_len = 0;
402 rqstp->rq_res.page_base = 0;
403
404
405 /* Reset response buffer and release
406 * the reservation.
407 * But first, check that enough space was reserved
408 * for the reply, otherwise we have a bug!
409 */
410 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
411 printk(KERN_ERR "RPC request reserved %d but used %d\n",
412 rqstp->rq_reserved,
413 rqstp->rq_res.len);
414
415 rqstp->rq_res.head[0].iov_len = 0;
416 svc_reserve(rqstp, 0);
417 rqstp->rq_sock = NULL;
418
419 svc_sock_put(svsk);
420}
421
422/*
423 * External function to wake up a server waiting for data
424 * This really only makes sense for services like lockd
425 * which have exactly one thread anyway.
426 */
427void
428svc_wake_up(struct svc_serv *serv)
429{
430 struct svc_rqst *rqstp;
431 unsigned int i;
432 struct svc_pool *pool;
433
434 for (i = 0; i < serv->sv_nrpools; i++) {
435 pool = &serv->sv_pools[i];
436
437 spin_lock_bh(&pool->sp_lock);
438 if (!list_empty(&pool->sp_threads)) {
439 rqstp = list_entry(pool->sp_threads.next,
440 struct svc_rqst,
441 rq_list);
442 dprintk("svc: daemon %p woken up.\n", rqstp);
443 /*
444 svc_thread_dequeue(pool, rqstp);
445 rqstp->rq_sock = NULL;
446 */
447 wake_up(&rqstp->rq_wait);
448 }
449 spin_unlock_bh(&pool->sp_lock);
450 }
451}
452
453union svc_pktinfo_u { 119union svc_pktinfo_u {
454 struct in_pktinfo pkti; 120 struct in_pktinfo pkti;
455 struct in6_pktinfo pkti6; 121 struct in6_pktinfo pkti6;
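
The big deletion above is mostly relocation: svc_sock_enqueue, svc_sock_dequeue, svc_reserve, svc_sock_put and svc_wake_up move, nearly verbatim, into the new generic net/sunrpc/svc_xprt.c. The one transport-specific piece, the write-space test, stays behind and resurfaces below as the xpo_has_wspace methods. The heuristic it preserves: every dispatched request reserves sv_max_mesg bytes of reply room, and new work is accepted only while twice the outstanding reservation still fits in the send buffer. As a sketch, with invented names:

    /* Sketch of the write-space gate that survives as xpo_has_wspace
     * (names invented; the kernel works in bytes of socket send buffer). */
    static int has_wspace(long reserved, long max_mesg, long wspace)
    {
        long required = reserved + max_mesg;  /* room for one more reply */
        return required * 2 <= wspace;        /* demand 2x headroom */
    }

The kernel variant also toggles SOCK_NOSPACE around the test, so a failed check arms the socket's write-space callback to re-enqueue the transport once room appears.
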
@@ -459,7 +125,9 @@ union svc_pktinfo_u {
459 125
460static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) 126static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
461{ 127{
462 switch (rqstp->rq_sock->sk_sk->sk_family) { 128 struct svc_sock *svsk =
129 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
130 switch (svsk->sk_sk->sk_family) {
463 case AF_INET: { 131 case AF_INET: {
464 struct in_pktinfo *pki = CMSG_DATA(cmh); 132 struct in_pktinfo *pki = CMSG_DATA(cmh);
465 133
@@ -489,10 +157,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
489/* 157/*
490 * Generic sendto routine 158 * Generic sendto routine
491 */ 159 */
492static int 160static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
493svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
494{ 161{
495 struct svc_sock *svsk = rqstp->rq_sock; 162 struct svc_sock *svsk =
163 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
496 struct socket *sock = svsk->sk_sock; 164 struct socket *sock = svsk->sk_sock;
497 int slen; 165 int slen;
498 union { 166 union {
@@ -565,7 +233,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
565 } 233 }
566out: 234out:
567 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 235 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
568 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, 236 svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
569 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 237 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
570 238
571 return len; 239 return len;
@@ -602,7 +270,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
602 if (!serv) 270 if (!serv)
603 return 0; 271 return 0;
604 spin_lock_bh(&serv->sv_lock); 272 spin_lock_bh(&serv->sv_lock);
605 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 273 list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
606 int onelen = one_sock_name(buf+len, svsk); 274 int onelen = one_sock_name(buf+len, svsk);
607 if (toclose && strcmp(toclose, buf+len) == 0) 275 if (toclose && strcmp(toclose, buf+len) == 0)
608 closesk = svsk; 276 closesk = svsk;
@@ -614,7 +282,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
614 /* Should unregister with portmap, but you cannot 282 /* Should unregister with portmap, but you cannot
615 * unregister just one protocol... 283 * unregister just one protocol...
616 */ 284 */
617 svc_close_socket(closesk); 285 svc_close_xprt(&closesk->sk_xprt);
618 else if (toclose) 286 else if (toclose)
619 return -ENOENT; 287 return -ENOENT;
620 return len; 288 return len;
@@ -624,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names);
624/* 292/*
625 * Check input queue length 293 * Check input queue length
626 */ 294 */
627static int 295static int svc_recv_available(struct svc_sock *svsk)
628svc_recv_available(struct svc_sock *svsk)
629{ 296{
630 struct socket *sock = svsk->sk_sock; 297 struct socket *sock = svsk->sk_sock;
631 int avail, err; 298 int avail, err;
@@ -638,48 +305,31 @@ svc_recv_available(struct svc_sock *svsk)
638/* 305/*
639 * Generic recvfrom routine. 306 * Generic recvfrom routine.
640 */ 307 */
641static int 308static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
642svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 309 int buflen)
643{ 310{
644 struct svc_sock *svsk = rqstp->rq_sock; 311 struct svc_sock *svsk =
312 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
645 struct msghdr msg = { 313 struct msghdr msg = {
646 .msg_flags = MSG_DONTWAIT, 314 .msg_flags = MSG_DONTWAIT,
647 }; 315 };
648 struct sockaddr *sin;
649 int len; 316 int len;
650 317
318 rqstp->rq_xprt_hlen = 0;
319
651 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, 320 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
652 msg.msg_flags); 321 msg.msg_flags);
653 322
654 /* sock_recvmsg doesn't fill in the name/namelen, so we must..
655 */
656 memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
657 rqstp->rq_addrlen = svsk->sk_remotelen;
658
659 /* Destination address in request is needed for binding the
660 * source address in RPC callbacks later.
661 */
662 sin = (struct sockaddr *)&svsk->sk_local;
663 switch (sin->sa_family) {
664 case AF_INET:
665 rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
666 break;
667 case AF_INET6:
668 rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
669 break;
670 }
671
672 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 323 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
673 svsk, iov[0].iov_base, iov[0].iov_len, len); 324 svsk, iov[0].iov_base, iov[0].iov_len, len);
674
675 return len; 325 return len;
676} 326}
677 327
678/* 328/*
679 * Set socket snd and rcv buffer lengths 329 * Set socket snd and rcv buffer lengths
680 */ 330 */
681static inline void 331static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
682svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 332 unsigned int rcv)
683{ 333{
684#if 0 334#if 0
685 mm_segment_t oldfs; 335 mm_segment_t oldfs;
@@ -704,16 +354,16 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
704/* 354/*
705 * INET callback when data has been received on the socket. 355 * INET callback when data has been received on the socket.
706 */ 356 */
707static void 357static void svc_udp_data_ready(struct sock *sk, int count)
708svc_udp_data_ready(struct sock *sk, int count)
709{ 358{
710 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 359 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
711 360
712 if (svsk) { 361 if (svsk) {
713 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 362 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
714 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 363 svsk, sk, count,
715 set_bit(SK_DATA, &svsk->sk_flags); 364 test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
716 svc_sock_enqueue(svsk); 365 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
366 svc_xprt_enqueue(&svsk->sk_xprt);
717 } 367 }
718 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 368 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
719 wake_up_interruptible(sk->sk_sleep); 369 wake_up_interruptible(sk->sk_sleep);
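
svc_udp_data_ready above keeps the shape shared by every sk_* callback in this file: record the event as a flag bit on the transport, enqueue the transport for a worker, then wake anyone sleeping directly on the socket. A userspace analogue of that flag-then-wake protocol, using a condition variable where the kernel uses wait queues (illustrative only):

    #include <pthread.h>

    struct xprt {
        pthread_mutex_t lock;
        pthread_cond_t  wait;
        unsigned long   flags;           /* bit 0 plays the role of XPT_DATA */
    };

    /* Called from the notification side when data arrives. */
    static void data_ready(struct xprt *x)
    {
        pthread_mutex_lock(&x->lock);
        x->flags |= 1UL;                 /* set_bit(XPT_DATA, ...) */
        pthread_cond_signal(&x->wait);   /* svc_xprt_enqueue() + wake_up() */
        pthread_mutex_unlock(&x->lock);
    }

    /* Worker side: sleep until the flag is set, then clear and service it. */
    static void wait_for_data(struct xprt *x)
    {
        pthread_mutex_lock(&x->lock);
        while (!(x->flags & 1UL))
            pthread_cond_wait(&x->wait, &x->lock);
        x->flags &= ~1UL;                /* clear_bit before reading */
        pthread_mutex_unlock(&x->lock);
    }
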
@@ -722,15 +372,14 @@ svc_udp_data_ready(struct sock *sk, int count)
722/* 372/*
723 * INET callback when space is newly available on the socket. 373 * INET callback when space is newly available on the socket.
724 */ 374 */
725static void 375static void svc_write_space(struct sock *sk)
726svc_write_space(struct sock *sk)
727{ 376{
728 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 377 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
729 378
730 if (svsk) { 379 if (svsk) {
731 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 380 dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
732 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 381 svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
733 svc_sock_enqueue(svsk); 382 svc_xprt_enqueue(&svsk->sk_xprt);
734 } 383 }
735 384
736 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { 385 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
@@ -740,10 +389,19 @@ svc_write_space(struct sock *sk)
740 } 389 }
741} 390}
742 391
743static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, 392/*
744 struct cmsghdr *cmh) 393 * Copy the UDP datagram's destination address to the rqstp structure.
394 * The 'destination' address in this case is the address to which the
395 * peer sent the datagram, i.e. our local address. For multihomed
396 * hosts, this can change from msg to msg. Note that only the IP
397 * address changes, the port number should remain the same.
398 */
399static void svc_udp_get_dest_address(struct svc_rqst *rqstp,
400 struct cmsghdr *cmh)
745{ 401{
746 switch (rqstp->rq_sock->sk_sk->sk_family) { 402 struct svc_sock *svsk =
403 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
404 switch (svsk->sk_sk->sk_family) {
747 case AF_INET: { 405 case AF_INET: {
748 struct in_pktinfo *pki = CMSG_DATA(cmh); 406 struct in_pktinfo *pki = CMSG_DATA(cmh);
749 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; 407 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
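
The comment added above explains the purpose: on a multihomed host the reply must be sourced from whichever local address the client targeted, and that address travels as IP_PKTINFO (or IPV6_PKTINFO) ancillary data rather than in the ordinary msghdr fields. A userspace sketch of reading it for IPv4; note the kernel path above uses ipi_spec_dst, while ipi_addr carries the datagram's literal header destination (error handling trimmed):

    #define _GNU_SOURCE
    #include <netinet/in.h>
    #include <sys/socket.h>

    /* Receive one datagram and report the local address it was sent to.
     * Requires the socket to have been prepared with:
     *   int on = 1;
     *   setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
     */
    static ssize_t recv_with_dst(int fd, void *buf, size_t len,
                                 struct in_addr *dst)
    {
        char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
            .msg_iov        = &iov,
            .msg_iovlen     = 1,
            .msg_control    = cbuf,
            .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cm;
        ssize_t n = recvmsg(fd, &msg, 0);

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
            if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_PKTINFO) {
                struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cm);
                *dst = pki->ipi_addr;   /* header destination address */
                break;
            }
        return n;
    }
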
@@ -760,11 +418,11 @@ static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
760/* 418/*
761 * Receive a datagram from a UDP socket. 419 * Receive a datagram from a UDP socket.
762 */ 420 */
763static int 421static int svc_udp_recvfrom(struct svc_rqst *rqstp)
764svc_udp_recvfrom(struct svc_rqst *rqstp)
765{ 422{
766 struct svc_sock *svsk = rqstp->rq_sock; 423 struct svc_sock *svsk =
767 struct svc_serv *serv = svsk->sk_server; 424 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
425 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
768 struct sk_buff *skb; 426 struct sk_buff *skb;
769 union { 427 union {
770 struct cmsghdr hdr; 428 struct cmsghdr hdr;
@@ -779,7 +437,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
779 .msg_flags = MSG_DONTWAIT, 437 .msg_flags = MSG_DONTWAIT,
780 }; 438 };
781 439
782 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 440 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
783 /* udp sockets need large rcvbuf as all pending 441 /* udp sockets need large rcvbuf as all pending
784 * requests are still in that buffer. sndbuf must 442 * requests are still in that buffer. sndbuf must
785 * also be large enough that there is enough space 443 * also be large enough that there is enough space
@@ -792,17 +450,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
792 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 450 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
793 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 451 (serv->sv_nrthreads+3) * serv->sv_max_mesg);
794 452
795 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 453 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
796 svc_sock_received(svsk);
797 return svc_deferred_recv(rqstp);
798 }
799
800 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
801 svc_delete_socket(svsk);
802 return 0;
803 }
804
805 clear_bit(SK_DATA, &svsk->sk_flags);
806 skb = NULL; 454 skb = NULL;
807 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 455 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
808 0, 0, MSG_PEEK | MSG_DONTWAIT); 456 0, 0, MSG_PEEK | MSG_DONTWAIT);
@@ -813,24 +461,27 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
813 if (err != -EAGAIN) { 461 if (err != -EAGAIN) {
814 /* possibly an icmp error */ 462 /* possibly an icmp error */
815 dprintk("svc: recvfrom returned error %d\n", -err); 463 dprintk("svc: recvfrom returned error %d\n", -err);
816 set_bit(SK_DATA, &svsk->sk_flags); 464 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
817 } 465 }
818 svc_sock_received(svsk); 466 svc_xprt_received(&svsk->sk_xprt);
819 return -EAGAIN; 467 return -EAGAIN;
820 } 468 }
821 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 469 len = svc_addr_len(svc_addr(rqstp));
470 if (len < 0)
471 return len;
472 rqstp->rq_addrlen = len;
822 if (skb->tstamp.tv64 == 0) { 473 if (skb->tstamp.tv64 == 0) {
823 skb->tstamp = ktime_get_real(); 474 skb->tstamp = ktime_get_real();
824 /* Don't enable netstamp, sunrpc doesn't 475 /* Don't enable netstamp, sunrpc doesn't
825 need that much accuracy */ 476 need that much accuracy */
826 } 477 }
827 svsk->sk_sk->sk_stamp = skb->tstamp; 478 svsk->sk_sk->sk_stamp = skb->tstamp;
828 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 479 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
829 480
830 /* 481 /*
831 * Maybe more packets - kick another thread ASAP. 482 * Maybe more packets - kick another thread ASAP.
832 */ 483 */
833 svc_sock_received(svsk); 484 svc_xprt_received(&svsk->sk_xprt);
834 485
835 len = skb->len - sizeof(struct udphdr); 486 len = skb->len - sizeof(struct udphdr);
836 rqstp->rq_arg.len = len; 487 rqstp->rq_arg.len = len;
@@ -861,13 +512,14 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
861 skb_free_datagram(svsk->sk_sk, skb); 512 skb_free_datagram(svsk->sk_sk, skb);
862 } else { 513 } else {
863 /* we can use it in-place */ 514 /* we can use it in-place */
864 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 515 rqstp->rq_arg.head[0].iov_base = skb->data +
516 sizeof(struct udphdr);
865 rqstp->rq_arg.head[0].iov_len = len; 517 rqstp->rq_arg.head[0].iov_len = len;
866 if (skb_checksum_complete(skb)) { 518 if (skb_checksum_complete(skb)) {
867 skb_free_datagram(svsk->sk_sk, skb); 519 skb_free_datagram(svsk->sk_sk, skb);
868 return 0; 520 return 0;
869 } 521 }
870 rqstp->rq_skbuff = skb; 522 rqstp->rq_xprt_ctxt = skb;
871 } 523 }
872 524
873 rqstp->rq_arg.page_base = 0; 525 rqstp->rq_arg.page_base = 0;
@@ -900,27 +552,81 @@ svc_udp_sendto(struct svc_rqst *rqstp)
900 return error; 552 return error;
901} 553}
902 554
903static void 555static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
904svc_udp_init(struct svc_sock *svsk) 556{
557}
558
559static int svc_udp_has_wspace(struct svc_xprt *xprt)
560{
561 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
562 struct svc_serv *serv = xprt->xpt_server;
563 unsigned long required;
564
565 /*
566 * Set the SOCK_NOSPACE flag before checking the available
567 * sock space.
568 */
569 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
570 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
571 if (required*2 > sock_wspace(svsk->sk_sk))
572 return 0;
573 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
574 return 1;
575}
576
577static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
578{
579 BUG();
580 return NULL;
581}
582
583static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
584 struct sockaddr *sa, int salen,
585 int flags)
586{
587 return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
588}
589
590static struct svc_xprt_ops svc_udp_ops = {
591 .xpo_create = svc_udp_create,
592 .xpo_recvfrom = svc_udp_recvfrom,
593 .xpo_sendto = svc_udp_sendto,
594 .xpo_release_rqst = svc_release_skb,
595 .xpo_detach = svc_sock_detach,
596 .xpo_free = svc_sock_free,
597 .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
598 .xpo_has_wspace = svc_udp_has_wspace,
599 .xpo_accept = svc_udp_accept,
600};
601
602static struct svc_xprt_class svc_udp_class = {
603 .xcl_name = "udp",
604 .xcl_owner = THIS_MODULE,
605 .xcl_ops = &svc_udp_ops,
606 .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
607};
608
609static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
905{ 610{
906 int one = 1; 611 int one = 1;
907 mm_segment_t oldfs; 612 mm_segment_t oldfs;
908 613
614 svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
615 clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
909 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 616 svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
910 svsk->sk_sk->sk_write_space = svc_write_space; 617 svsk->sk_sk->sk_write_space = svc_write_space;
911 svsk->sk_recvfrom = svc_udp_recvfrom;
912 svsk->sk_sendto = svc_udp_sendto;
913 618
914 /* initialise setting must have enough space to 619 /* initialise setting must have enough space to
915 * receive and respond to one request. 620 * receive and respond to one request.
916 * svc_udp_recvfrom will re-adjust if necessary 621 * svc_udp_recvfrom will re-adjust if necessary
917 */ 622 */
918 svc_sock_setbufsize(svsk->sk_sock, 623 svc_sock_setbufsize(svsk->sk_sock,
919 3 * svsk->sk_server->sv_max_mesg, 624 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
920 3 * svsk->sk_server->sv_max_mesg); 625 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
921 626
922 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 627 /* data might have come in before data_ready set up */
923 set_bit(SK_CHNGBUF, &svsk->sk_flags); 628 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
629 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
924 630
925 oldfs = get_fs(); 631 oldfs = get_fs();
926 set_fs(KERNEL_DS); 632 set_fs(KERNEL_DS);
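
The svc_udp_ops/svc_udp_class pair above (with TCP twins later in this patch) is the heart of the conversion: transport behaviour becomes a table of methods, a class groups the table with its payload limit, and svc_udp_init now simply binds a socket to its class through svc_xprt_init. The same pattern reduced to plain C, stripped of everything kernel-specific (all names invented):

    #include <stdio.h>

    struct xprt;                              /* forward declaration */

    struct xprt_ops {                         /* role of svc_xprt_ops */
        int (*recvfrom)(struct xprt *);
        int (*sendto)(struct xprt *);
    };

    struct xprt_class {                       /* role of svc_xprt_class */
        const char            *name;
        const struct xprt_ops *ops;
        int                    max_payload;
    };

    struct xprt {
        const struct xprt_class *class;       /* bound at init time */
    };

    static int udp_recvfrom(struct xprt *x) { (void)x; return 0; }
    static int udp_sendto(struct xprt *x)   { (void)x; return 0; }

    static const struct xprt_ops udp_ops = {
        .recvfrom = udp_recvfrom,
        .sendto   = udp_sendto,
    };

    static const struct xprt_class udp_class = {
        .name = "udp", .ops = &udp_ops, .max_payload = 32768,
    };

    int main(void)
    {
        struct xprt x = { .class = &udp_class };
        /* generic code never knows which transport it is driving */
        x.class->ops->recvfrom(&x);
        printf("served one request over %s\n", x.class->name);
        return 0;
    }

Once the generic code drives only the ops table, a new transport can register its own class without the generic code changing.
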
@@ -934,8 +640,7 @@ svc_udp_init(struct svc_sock *svsk)
934 * A data_ready event on a listening socket means there's a connection 640 * A data_ready event on a listening socket means there's a connection
935 * pending. Do not use state_change as a substitute for it. 641 * pending. Do not use state_change as a substitute for it.
936 */ 642 */
937static void 643static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
938svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
939{ 644{
940 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 645 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
941 646
@@ -954,8 +659,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
954 */ 659 */
955 if (sk->sk_state == TCP_LISTEN) { 660 if (sk->sk_state == TCP_LISTEN) {
956 if (svsk) { 661 if (svsk) {
957 set_bit(SK_CONN, &svsk->sk_flags); 662 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
958 svc_sock_enqueue(svsk); 663 svc_xprt_enqueue(&svsk->sk_xprt);
959 } else 664 } else
960 printk("svc: socket %p: no user data\n", sk); 665 printk("svc: socket %p: no user data\n", sk);
961 } 666 }
@@ -967,8 +672,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
967/* 672/*
968 * A state change on a connected socket means it's dying or dead. 673 * A state change on a connected socket means it's dying or dead.
969 */ 674 */
970static void 675static void svc_tcp_state_change(struct sock *sk)
971svc_tcp_state_change(struct sock *sk)
972{ 676{
973 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 677 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
974 678
@@ -978,51 +682,36 @@ svc_tcp_state_change(struct sock *sk)
978 if (!svsk) 682 if (!svsk)
979 printk("svc: socket %p: no user data\n", sk); 683 printk("svc: socket %p: no user data\n", sk);
980 else { 684 else {
981 set_bit(SK_CLOSE, &svsk->sk_flags); 685 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
982 svc_sock_enqueue(svsk); 686 svc_xprt_enqueue(&svsk->sk_xprt);
983 } 687 }
984 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 688 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
985 wake_up_interruptible_all(sk->sk_sleep); 689 wake_up_interruptible_all(sk->sk_sleep);
986} 690}
987 691
988static void 692static void svc_tcp_data_ready(struct sock *sk, int count)
989svc_tcp_data_ready(struct sock *sk, int count)
990{ 693{
991 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 694 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
992 695
993 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 696 dprintk("svc: socket %p TCP data ready (svsk %p)\n",
994 sk, sk->sk_user_data); 697 sk, sk->sk_user_data);
995 if (svsk) { 698 if (svsk) {
996 set_bit(SK_DATA, &svsk->sk_flags); 699 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
997 svc_sock_enqueue(svsk); 700 svc_xprt_enqueue(&svsk->sk_xprt);
998 } 701 }
999 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 702 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1000 wake_up_interruptible(sk->sk_sleep); 703 wake_up_interruptible(sk->sk_sleep);
1001} 704}
1002 705
1003static inline int svc_port_is_privileged(struct sockaddr *sin)
1004{
1005 switch (sin->sa_family) {
1006 case AF_INET:
1007 return ntohs(((struct sockaddr_in *)sin)->sin_port)
1008 < PROT_SOCK;
1009 case AF_INET6:
1010 return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
1011 < PROT_SOCK;
1012 default:
1013 return 0;
1014 }
1015}
1016
1017/* 706/*
1018 * Accept a TCP connection 707 * Accept a TCP connection
1019 */ 708 */
1020static void 709static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
1021svc_tcp_accept(struct svc_sock *svsk)
1022{ 710{
711 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1023 struct sockaddr_storage addr; 712 struct sockaddr_storage addr;
1024 struct sockaddr *sin = (struct sockaddr *) &addr; 713 struct sockaddr *sin = (struct sockaddr *) &addr;
1025 struct svc_serv *serv = svsk->sk_server; 714 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1026 struct socket *sock = svsk->sk_sock; 715 struct socket *sock = svsk->sk_sock;
1027 struct socket *newsock; 716 struct socket *newsock;
1028 struct svc_sock *newsvsk; 717 struct svc_sock *newsvsk;
@@ -1031,9 +720,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1031 720
1032 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 721 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
1033 if (!sock) 722 if (!sock)
1034 return; 723 return NULL;
1035 724
1036 clear_bit(SK_CONN, &svsk->sk_flags); 725 clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1037 err = kernel_accept(sock, &newsock, O_NONBLOCK); 726 err = kernel_accept(sock, &newsock, O_NONBLOCK);
1038 if (err < 0) { 727 if (err < 0) {
1039 if (err == -ENOMEM) 728 if (err == -ENOMEM)
@@ -1042,11 +731,9 @@ svc_tcp_accept(struct svc_sock *svsk)
1042 else if (err != -EAGAIN && net_ratelimit()) 731 else if (err != -EAGAIN && net_ratelimit())
1043 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 732 printk(KERN_WARNING "%s: accept failed (err %d)!\n",
1044 serv->sv_name, -err); 733 serv->sv_name, -err);
1045 return; 734 return NULL;
1046 } 735 }
1047 736 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1048 set_bit(SK_CONN, &svsk->sk_flags);
1049 svc_sock_enqueue(svsk);
1050 737
1051 err = kernel_getpeername(newsock, sin, &slen); 738 err = kernel_getpeername(newsock, sin, &slen);
1052 if (err < 0) { 739 if (err < 0) {
@@ -1077,106 +764,42 @@ svc_tcp_accept(struct svc_sock *svsk)
1077 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 764 if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
1078 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) 765 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
1079 goto failed; 766 goto failed;
1080 memcpy(&newsvsk->sk_remote, sin, slen); 767 svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
1081 newsvsk->sk_remotelen = slen;
1082 err = kernel_getsockname(newsock, sin, &slen); 768 err = kernel_getsockname(newsock, sin, &slen);
1083 if (unlikely(err < 0)) { 769 if (unlikely(err < 0)) {
1084 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); 770 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
1085 slen = offsetof(struct sockaddr, sa_data); 771 slen = offsetof(struct sockaddr, sa_data);
1086 } 772 }
1087 memcpy(&newsvsk->sk_local, sin, slen); 773 svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
1088
1089 svc_sock_received(newsvsk);
1090
1091 /* make sure that we don't have too many active connections.
1092 * If we have, something must be dropped.
1093 *
1094 * There's no point in trying to do random drop here for
1095 * DoS prevention. The NFS clients does 1 reconnect in 15
1096 * seconds. An attacker can easily beat that.
1097 *
1098 * The only somewhat efficient mechanism would be if drop
1099 * old connections from the same IP first. But right now
1100 * we don't even record the client IP in svc_sock.
1101 */
1102 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
1103 struct svc_sock *svsk = NULL;
1104 spin_lock_bh(&serv->sv_lock);
1105 if (!list_empty(&serv->sv_tempsocks)) {
1106 if (net_ratelimit()) {
1107 /* Try to help the admin */
1108 printk(KERN_NOTICE "%s: too many open TCP "
1109 "sockets, consider increasing the "
1110 "number of nfsd threads\n",
1111 serv->sv_name);
1112 printk(KERN_NOTICE
1113 "%s: last TCP connect from %s\n",
1114 serv->sv_name, __svc_print_addr(sin,
1115 buf, sizeof(buf)));
1116 }
1117 /*
1118 * Always select the oldest socket. It's not fair,
1119 * but so is life
1120 */
1121 svsk = list_entry(serv->sv_tempsocks.prev,
1122 struct svc_sock,
1123 sk_list);
1124 set_bit(SK_CLOSE, &svsk->sk_flags);
1125 atomic_inc(&svsk->sk_inuse);
1126 }
1127 spin_unlock_bh(&serv->sv_lock);
1128
1129 if (svsk) {
1130 svc_sock_enqueue(svsk);
1131 svc_sock_put(svsk);
1132 }
1133
1134 }
1135 774
1136 if (serv->sv_stats) 775 if (serv->sv_stats)
1137 serv->sv_stats->nettcpconn++; 776 serv->sv_stats->nettcpconn++;
1138 777
1139 return; 778 return &newsvsk->sk_xprt;
1140 779
1141failed: 780failed:
1142 sock_release(newsock); 781 sock_release(newsock);
1143 return; 782 return NULL;
1144} 783}
1145 784
1146/* 785/*
1147 * Receive data from a TCP socket. 786 * Receive data from a TCP socket.
1148 */ 787 */
1149static int 788static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
1150svc_tcp_recvfrom(struct svc_rqst *rqstp)
1151{ 789{
1152 struct svc_sock *svsk = rqstp->rq_sock; 790 struct svc_sock *svsk =
1153 struct svc_serv *serv = svsk->sk_server; 791 container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
792 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1154 int len; 793 int len;
1155 struct kvec *vec; 794 struct kvec *vec;
1156 int pnum, vlen; 795 int pnum, vlen;
1157 796
1158 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 797 dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
1159 svsk, test_bit(SK_DATA, &svsk->sk_flags), 798 svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
1160 test_bit(SK_CONN, &svsk->sk_flags), 799 test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
1161 test_bit(SK_CLOSE, &svsk->sk_flags)); 800 test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
1162 801
1163 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 802 if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
1164 svc_sock_received(svsk);
1165 return svc_deferred_recv(rqstp);
1166 }
1167
1168 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
1169 svc_delete_socket(svsk);
1170 return 0;
1171 }
1172
1173 if (svsk->sk_sk->sk_state == TCP_LISTEN) {
1174 svc_tcp_accept(svsk);
1175 svc_sock_received(svsk);
1176 return 0;
1177 }
1178
1179 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
1180 /* sndbuf needs to have room for one request 803 /* sndbuf needs to have room for one request
1181 * per thread, otherwise we can stall even when the 804 * per thread, otherwise we can stall even when the
1182 * network isn't a bottleneck. 805 * network isn't a bottleneck.
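
The checks deleted from svc_tcp_recvfrom above (deferred-request replay, XPT_CLOSE teardown, TCP_LISTEN accept) do not disappear; they become transport-independent and run in the generic receive path in the new svc_xprt.c, which consults the flag bits and xpo_accept before ever calling xpo_recvfrom. Approximately, with stand-in types so the sketch compiles on its own (control flow paraphrased, not the literal kernel code):

    #include <stddef.h>

    /* Stand-ins so the sketch compiles; the real helpers live in the
     * new net/sunrpc/svc_xprt.c and differ in detail. */
    enum { XPT_CLOSE, XPT_LISTENER };
    struct svc_rqst;
    struct svc_xprt;
    struct svc_xprt_ops {
        struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
        int (*xpo_recvfrom)(struct svc_rqst *);
    };
    struct svc_xprt { unsigned long flags; const struct svc_xprt_ops *ops; };
    struct svc_rqst { struct svc_xprt *xprt; void *deferred; };

    static int  test_bit(int b, const unsigned long *f) { return (*f >> b) & 1; }
    static void svc_delete_xprt(struct svc_xprt *x)     { (void)x; }
    static void svc_xprt_received(struct svc_xprt *x)   { (void)x; }
    static void *svc_deferred_dequeue(struct svc_xprt *x) { (void)x; return NULL; }
    static int  svc_deferred_recv(struct svc_rqst *r)   { (void)r; return 0; }

    /* Paraphrase of the generic dispatch that replaces the deleted checks. */
    static int handle_xprt(struct svc_rqst *rqstp)
    {
        struct svc_xprt *xprt = rqstp->xprt;

        if (test_bit(XPT_CLOSE, &xprt->flags)) {
            svc_delete_xprt(xprt);                  /* dying: tear it down */
            return 0;
        }
        if (test_bit(XPT_LISTENER, &xprt->flags)) {
            xprt->ops->xpo_accept(xprt);            /* connection, not data */
            svc_xprt_received(xprt);
            return 0;
        }
        if ((rqstp->deferred = svc_deferred_dequeue(xprt)) != NULL)
            return svc_deferred_recv(rqstp);        /* replay a deferral */
        return xprt->ops->xpo_recvfrom(rqstp);      /* ordinary receive */
    }

    int main(void) { (void)handle_xprt; return 0; }
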
@@ -1193,7 +816,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1193 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 816 (serv->sv_nrthreads+3) * serv->sv_max_mesg,
1194 3 * serv->sv_max_mesg); 817 3 * serv->sv_max_mesg);
1195 818
1196 clear_bit(SK_DATA, &svsk->sk_flags); 819 clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1197 820
1198 /* Receive data. If we haven't got the record length yet, get 821 /* Receive data. If we haven't got the record length yet, get
1199 * the next four bytes. Otherwise try to gobble up as much as 822 * the next four bytes. Otherwise try to gobble up as much as
@@ -1212,7 +835,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1212 if (len < want) { 835 if (len < want) {
1213 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 836 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
1214 len, want); 837 len, want);
1215 svc_sock_received(svsk); 838 svc_xprt_received(&svsk->sk_xprt);
1216 return -EAGAIN; /* record header not complete */ 839 return -EAGAIN; /* record header not complete */
1217 } 840 }
1218 841
@@ -1248,11 +871,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1248 if (len < svsk->sk_reclen) { 871 if (len < svsk->sk_reclen) {
1249 dprintk("svc: incomplete TCP record (%d of %d)\n", 872 dprintk("svc: incomplete TCP record (%d of %d)\n",
1250 len, svsk->sk_reclen); 873 len, svsk->sk_reclen);
1251 svc_sock_received(svsk); 874 svc_xprt_received(&svsk->sk_xprt);
1252 return -EAGAIN; /* record not complete */ 875 return -EAGAIN; /* record not complete */
1253 } 876 }
1254 len = svsk->sk_reclen; 877 len = svsk->sk_reclen;
1255 set_bit(SK_DATA, &svsk->sk_flags); 878 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1256 879
1257 vec = rqstp->rq_vec; 880 vec = rqstp->rq_vec;
1258 vec[0] = rqstp->rq_arg.head[0]; 881 vec[0] = rqstp->rq_arg.head[0];
@@ -1281,30 +904,31 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1281 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 904 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
1282 } 905 }
1283 906
1284 rqstp->rq_skbuff = NULL; 907 rqstp->rq_xprt_ctxt = NULL;
1285 rqstp->rq_prot = IPPROTO_TCP; 908 rqstp->rq_prot = IPPROTO_TCP;
1286 909
1287 /* Reset TCP read info */ 910 /* Reset TCP read info */
1288 svsk->sk_reclen = 0; 911 svsk->sk_reclen = 0;
1289 svsk->sk_tcplen = 0; 912 svsk->sk_tcplen = 0;
1290 913
1291 svc_sock_received(svsk); 914 svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
915 svc_xprt_received(&svsk->sk_xprt);
1292 if (serv->sv_stats) 916 if (serv->sv_stats)
1293 serv->sv_stats->nettcpcnt++; 917 serv->sv_stats->nettcpcnt++;
1294 918
1295 return len; 919 return len;
1296 920
1297 err_delete: 921 err_delete:
1298 svc_delete_socket(svsk); 922 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1299 return -EAGAIN; 923 return -EAGAIN;
1300 924
1301 error: 925 error:
1302 if (len == -EAGAIN) { 926 if (len == -EAGAIN) {
1303 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 927 dprintk("RPC: TCP recvfrom got EAGAIN\n");
1304 svc_sock_received(svsk); 928 svc_xprt_received(&svsk->sk_xprt);
1305 } else { 929 } else {
1306 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 930 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
1307 svsk->sk_server->sv_name, -len); 931 svsk->sk_xprt.xpt_server->sv_name, -len);
1308 goto err_delete; 932 goto err_delete;
1309 } 933 }
1310 934
@@ -1314,8 +938,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1314/* 938/*
1315 * Send out data on TCP socket. 939 * Send out data on TCP socket.
1316 */ 940 */
1317static int 941static int svc_tcp_sendto(struct svc_rqst *rqstp)
1318svc_tcp_sendto(struct svc_rqst *rqstp)
1319{ 942{
1320 struct xdr_buf *xbufp = &rqstp->rq_res; 943 struct xdr_buf *xbufp = &rqstp->rq_res;
1321 int sent; 944 int sent;
@@ -1328,35 +951,109 @@ svc_tcp_sendto(struct svc_rqst *rqstp)
1328 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 951 reclen = htonl(0x80000000|((xbufp->len ) - 4));
1329 memcpy(xbufp->head[0].iov_base, &reclen, 4); 952 memcpy(xbufp->head[0].iov_base, &reclen, 4);
1330 953
1331 if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) 954 if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
1332 return -ENOTCONN; 955 return -ENOTCONN;
1333 956
1334 sent = svc_sendto(rqstp, &rqstp->rq_res); 957 sent = svc_sendto(rqstp, &rqstp->rq_res);
1335 if (sent != xbufp->len) { 958 if (sent != xbufp->len) {
1336 printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", 959 printk(KERN_NOTICE
1337 rqstp->rq_sock->sk_server->sv_name, 960 "rpc-srv/tcp: %s: %s %d when sending %d bytes "
961 "- shutting down socket\n",
962 rqstp->rq_xprt->xpt_server->sv_name,
1338 (sent<0)?"got error":"sent only", 963 (sent<0)?"got error":"sent only",
1339 sent, xbufp->len); 964 sent, xbufp->len);
1340 set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); 965 set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
1341 svc_sock_enqueue(rqstp->rq_sock); 966 svc_xprt_enqueue(rqstp->rq_xprt);
1342 sent = -EAGAIN; 967 sent = -EAGAIN;
1343 } 968 }
1344 return sent; 969 return sent;
1345} 970}
1346 971
1347static void 972/*
1348svc_tcp_init(struct svc_sock *svsk) 973 * Setup response header. TCP has a 4B record length field.
974 */
975static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
976{
977 struct kvec *resv = &rqstp->rq_res.head[0];
978
979 /* tcp needs a space for the record length... */
980 svc_putnl(resv, 0);
981}
982
983static int svc_tcp_has_wspace(struct svc_xprt *xprt)
984{
985 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
986 struct svc_serv *serv = svsk->sk_xprt.xpt_server;
987 int required;
988 int wspace;
989
990 /*
991 * Set the SOCK_NOSPACE flag before checking the available
992 * sock space.
993 */
994 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
995 required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
996 wspace = sk_stream_wspace(svsk->sk_sk);
997
998 if (wspace < sk_stream_min_wspace(svsk->sk_sk))
999 return 0;
1000 if (required * 2 > wspace)
1001 return 0;
1002
1003 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
1004 return 1;
1005}
1006
1007static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
1008 struct sockaddr *sa, int salen,
1009 int flags)
1010{
1011 return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
1012}
1013
1014static struct svc_xprt_ops svc_tcp_ops = {
1015 .xpo_create = svc_tcp_create,
1016 .xpo_recvfrom = svc_tcp_recvfrom,
1017 .xpo_sendto = svc_tcp_sendto,
1018 .xpo_release_rqst = svc_release_skb,
1019 .xpo_detach = svc_sock_detach,
1020 .xpo_free = svc_sock_free,
1021 .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
1022 .xpo_has_wspace = svc_tcp_has_wspace,
1023 .xpo_accept = svc_tcp_accept,
1024};
1025
1026static struct svc_xprt_class svc_tcp_class = {
1027 .xcl_name = "tcp",
1028 .xcl_owner = THIS_MODULE,
1029 .xcl_ops = &svc_tcp_ops,
1030 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
1031};
1032
1033void svc_init_xprt_sock(void)
1034{
1035 svc_reg_xprt_class(&svc_tcp_class);
1036 svc_reg_xprt_class(&svc_udp_class);
1037}
1038
1039void svc_cleanup_xprt_sock(void)
1040{
1041 svc_unreg_xprt_class(&svc_tcp_class);
1042 svc_unreg_xprt_class(&svc_udp_class);
1043}
1044
1045static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
1349{ 1046{
1350 struct sock *sk = svsk->sk_sk; 1047 struct sock *sk = svsk->sk_sk;
1351 struct tcp_sock *tp = tcp_sk(sk); 1048 struct tcp_sock *tp = tcp_sk(sk);
1352 1049
1353 svsk->sk_recvfrom = svc_tcp_recvfrom; 1050 svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
1354 svsk->sk_sendto = svc_tcp_sendto; 1051 set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
1355
1356 if (sk->sk_state == TCP_LISTEN) { 1052 if (sk->sk_state == TCP_LISTEN) {
1357 dprintk("setting up TCP socket for listening\n"); 1053 dprintk("setting up TCP socket for listening\n");
1054 set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
1358 sk->sk_data_ready = svc_tcp_listen_data_ready; 1055 sk->sk_data_ready = svc_tcp_listen_data_ready;
1359 set_bit(SK_CONN, &svsk->sk_flags); 1056 set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
1360 } else { 1057 } else {
1361 dprintk("setting up TCP socket for reading\n"); 1058 dprintk("setting up TCP socket for reading\n");
1362 sk->sk_state_change = svc_tcp_state_change; 1059 sk->sk_state_change = svc_tcp_state_change;
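
svc_tcp_prep_reply_hdr and svc_tcp_sendto above cooperate on RPC record marking over TCP (RFC 1831, section 10): the reply is prefixed by a 4-byte big-endian word whose top bit marks the final fragment and whose low 31 bits give the fragment length, which is why prep_reply_hdr reserves the space with svc_putnl(resv, 0) and sendto later overwrites it with htonl(0x80000000 | (len - 4)). A tiny standalone encoder/decoder for that word:

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>

    /* Build the 4-byte RPC record marker: high bit = last fragment,
     * low 31 bits = fragment length (the reply minus the marker itself). */
    static uint32_t rpc_record_marker(uint32_t reply_len)
    {
        return htonl(0x80000000u | (reply_len - 4));
    }

    static uint32_t rpc_fragment_len(uint32_t marker)
    {
        return ntohl(marker) & 0x7fffffffu;
    }

    static int rpc_is_last_fragment(uint32_t marker)
    {
        return (ntohl(marker) & 0x80000000u) != 0;
    }

    int main(void)
    {
        uint32_t m = rpc_record_marker(132);  /* 128-byte payload + marker */
        assert(rpc_fragment_len(m) == 128);
        assert(rpc_is_last_fragment(m));
        return 0;
    }
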
@@ -1373,18 +1070,17 @@ svc_tcp_init(struct svc_sock *svsk)
1373 * svc_tcp_recvfrom will re-adjust if necessary 1070 * svc_tcp_recvfrom will re-adjust if necessary
1374 */ 1071 */
1375 svc_sock_setbufsize(svsk->sk_sock, 1072 svc_sock_setbufsize(svsk->sk_sock,
1376 3 * svsk->sk_server->sv_max_mesg, 1073 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
1377 3 * svsk->sk_server->sv_max_mesg); 1074 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
1378 1075
1379 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1076 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1380 set_bit(SK_DATA, &svsk->sk_flags); 1077 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
1381 if (sk->sk_state != TCP_ESTABLISHED) 1078 if (sk->sk_state != TCP_ESTABLISHED)
1382 set_bit(SK_CLOSE, &svsk->sk_flags); 1079 set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
1383 } 1080 }
1384} 1081}
1385 1082
1386void 1083void svc_sock_update_bufs(struct svc_serv *serv)
1387svc_sock_update_bufs(struct svc_serv *serv)
1388{ 1084{
1389 /* 1085 /*
1390 * The number of server threads has changed. Update 1086 * The number of server threads has changed. Update
@@ -1395,232 +1091,18 @@ svc_sock_update_bufs(struct svc_serv *serv)
1395 spin_lock_bh(&serv->sv_lock); 1091 spin_lock_bh(&serv->sv_lock);
1396 list_for_each(le, &serv->sv_permsocks) { 1092 list_for_each(le, &serv->sv_permsocks) {
1397 struct svc_sock *svsk = 1093 struct svc_sock *svsk =
1398 list_entry(le, struct svc_sock, sk_list); 1094 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1399 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1095 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1400 } 1096 }
1401 list_for_each(le, &serv->sv_tempsocks) { 1097 list_for_each(le, &serv->sv_tempsocks) {
1402 struct svc_sock *svsk = 1098 struct svc_sock *svsk =
1403 list_entry(le, struct svc_sock, sk_list); 1099 list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1404 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1100 set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1405 } 1101 }
1406 spin_unlock_bh(&serv->sv_lock); 1102 spin_unlock_bh(&serv->sv_lock);
1407} 1103}
1408 1104
1409/* 1105/*
1410 * Receive the next request on any socket. This code is carefully
1411 * organised not to touch any cachelines in the shared svc_serv
1412 * structure, only cachelines in the local svc_pool.
1413 */
1414int
1415svc_recv(struct svc_rqst *rqstp, long timeout)
1416{
1417 struct svc_sock *svsk = NULL;
1418 struct svc_serv *serv = rqstp->rq_server;
1419 struct svc_pool *pool = rqstp->rq_pool;
1420 int len, i;
1421 int pages;
1422 struct xdr_buf *arg;
1423 DECLARE_WAITQUEUE(wait, current);
1424
1425 dprintk("svc: server %p waiting for data (to = %ld)\n",
1426 rqstp, timeout);
1427
1428 if (rqstp->rq_sock)
1429 printk(KERN_ERR
1430 "svc_recv: service %p, socket not NULL!\n",
1431 rqstp);
1432 if (waitqueue_active(&rqstp->rq_wait))
1433 printk(KERN_ERR
1434 "svc_recv: service %p, wait queue active!\n",
1435 rqstp);
1436
1437
1438 /* now allocate needed pages. If we get a failure, sleep briefly */
1439 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
1440 for (i=0; i < pages ; i++)
1441 while (rqstp->rq_pages[i] == NULL) {
1442 struct page *p = alloc_page(GFP_KERNEL);
1443 if (!p)
1444 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
1445 rqstp->rq_pages[i] = p;
1446 }
1447 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
1448 BUG_ON(pages >= RPCSVC_MAXPAGES);
1449
1450 /* Make arg->head point to first page and arg->pages point to rest */
1451 arg = &rqstp->rq_arg;
1452 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
1453 arg->head[0].iov_len = PAGE_SIZE;
1454 arg->pages = rqstp->rq_pages + 1;
1455 arg->page_base = 0;
1456 /* save at least one page for response */
1457 arg->page_len = (pages-2)*PAGE_SIZE;
1458 arg->len = (pages-1)*PAGE_SIZE;
1459 arg->tail[0].iov_len = 0;
1460
1461 try_to_freeze();
1462 cond_resched();
1463 if (signalled())
1464 return -EINTR;
1465
1466 spin_lock_bh(&pool->sp_lock);
1467 if ((svsk = svc_sock_dequeue(pool)) != NULL) {
1468 rqstp->rq_sock = svsk;
1469 atomic_inc(&svsk->sk_inuse);
1470 rqstp->rq_reserved = serv->sv_max_mesg;
1471 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
1472 } else {
1473 /* No data pending. Go to sleep */
1474 svc_thread_enqueue(pool, rqstp);
1475
1476 /*
1477 * We have to be able to interrupt this wait
1478 * to bring down the daemons ...
1479 */
1480 set_current_state(TASK_INTERRUPTIBLE);
1481 add_wait_queue(&rqstp->rq_wait, &wait);
1482 spin_unlock_bh(&pool->sp_lock);
1483
1484 schedule_timeout(timeout);
1485
1486 try_to_freeze();
1487
1488 spin_lock_bh(&pool->sp_lock);
1489 remove_wait_queue(&rqstp->rq_wait, &wait);
1490
1491 if (!(svsk = rqstp->rq_sock)) {
1492 svc_thread_dequeue(pool, rqstp);
1493 spin_unlock_bh(&pool->sp_lock);
1494 dprintk("svc: server %p, no data yet\n", rqstp);
1495 return signalled()? -EINTR : -EAGAIN;
1496 }
1497 }
1498 spin_unlock_bh(&pool->sp_lock);
1499
1500 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
1501 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
1502 len = svsk->sk_recvfrom(rqstp);
1503 dprintk("svc: got len=%d\n", len);
1504
1505 /* No data, incomplete (TCP) read, or accept() */
1506 if (len == 0 || len == -EAGAIN) {
1507 rqstp->rq_res.len = 0;
1508 svc_sock_release(rqstp);
1509 return -EAGAIN;
1510 }
1511 svsk->sk_lastrecv = get_seconds();
1512 clear_bit(SK_OLD, &svsk->sk_flags);
1513
1514 rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
1515 rqstp->rq_chandle.defer = svc_defer;
1516
1517 if (serv->sv_stats)
1518 serv->sv_stats->netcnt++;
1519 return len;
1520}
1521
1522/*
1523 * Drop request
1524 */
1525void
1526svc_drop(struct svc_rqst *rqstp)
1527{
1528 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
1529 svc_sock_release(rqstp);
1530}
1531
1532/*
1533 * Return reply to client.
1534 */
1535int
1536svc_send(struct svc_rqst *rqstp)
1537{
1538 struct svc_sock *svsk;
1539 int len;
1540 struct xdr_buf *xb;
1541
1542 if ((svsk = rqstp->rq_sock) == NULL) {
1543 printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
1544 __FILE__, __LINE__);
1545 return -EFAULT;
1546 }
1547
1548 /* release the receive skb before sending the reply */
1549 svc_release_skb(rqstp);
1550
1551 /* calculate over-all length */
1552 xb = & rqstp->rq_res;
1553 xb->len = xb->head[0].iov_len +
1554 xb->page_len +
1555 xb->tail[0].iov_len;
1556
1557 /* Grab svsk->sk_mutex to serialize outgoing data. */
1558 mutex_lock(&svsk->sk_mutex);
1559 if (test_bit(SK_DEAD, &svsk->sk_flags))
1560 len = -ENOTCONN;
1561 else
1562 len = svsk->sk_sendto(rqstp);
1563 mutex_unlock(&svsk->sk_mutex);
1564 svc_sock_release(rqstp);
1565
1566 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
1567 return 0;
1568 return len;
1569}
1570
1571/*
1572 * Timer function to close old temporary sockets, using
1573 * a mark-and-sweep algorithm.
1574 */
1575static void
1576svc_age_temp_sockets(unsigned long closure)
1577{
1578 struct svc_serv *serv = (struct svc_serv *)closure;
1579 struct svc_sock *svsk;
1580 struct list_head *le, *next;
1581 LIST_HEAD(to_be_aged);
1582
1583 dprintk("svc_age_temp_sockets\n");
1584
1585 if (!spin_trylock_bh(&serv->sv_lock)) {
1586 /* busy, try again 1 sec later */
1587 dprintk("svc_age_temp_sockets: busy\n");
1588 mod_timer(&serv->sv_temptimer, jiffies + HZ);
1589 return;
1590 }
1591
1592 list_for_each_safe(le, next, &serv->sv_tempsocks) {
1593 svsk = list_entry(le, struct svc_sock, sk_list);
1594
1595 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
1596 continue;
1597 if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags))
1598 continue;
1599 atomic_inc(&svsk->sk_inuse);
1600 list_move(le, &to_be_aged);
1601 set_bit(SK_CLOSE, &svsk->sk_flags);
1602 set_bit(SK_DETACHED, &svsk->sk_flags);
1603 }
1604 spin_unlock_bh(&serv->sv_lock);
1605
1606 while (!list_empty(&to_be_aged)) {
1607 le = to_be_aged.next;
1608 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
1609 list_del_init(le);
1610 svsk = list_entry(le, struct svc_sock, sk_list);
1611
1612 dprintk("queuing svsk %p for closing, %lu seconds old\n",
1613 svsk, get_seconds() - svsk->sk_lastrecv);
1614
1615 /* a thread will dequeue and close it soon */
1616 svc_sock_enqueue(svsk);
1617 svc_sock_put(svsk);
1618 }
1619
1620 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
1621}
1622
1623/*
1624 * Initialize socket for RPC use and create svc_sock struct 1106 * Initialize socket for RPC use and create svc_sock struct
1625 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 1107 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
1626 */ 1108 */
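
svc_age_temp_sockets, removed above, likewise survives in the generic code (as svc_age_temp_xprts). Its mark-and-sweep idea: each timer period sets an OLD bit on every temporary connection, and any connection still marked a full period later has seen no traffic, because every successful receive clears the bit in between; those are queued for closing. A compact sketch of the scheme (illustrative names):

    #include <stdbool.h>

    struct conn {
        bool old;      /* plays the role of SK_OLD */
        bool close;    /* plays the role of SK_CLOSE / XPT_CLOSE */
        bool busy;     /* currently being serviced */
    };

    /* Called on every successful receive. */
    static void conn_saw_traffic(struct conn *c)
    {
        c->old = false;
    }

    /* Called once per aging period for each temporary connection. */
    static void conn_age(struct conn *c)
    {
        if (!c->old) {
            c->old = true;       /* mark: grant one more period */
            return;
        }
        if (c->busy)
            return;              /* in use right now: spare it */
        c->close = true;         /* swept: idle for a whole period */
    }
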
@@ -1631,7 +1113,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1631 struct svc_sock *svsk; 1113 struct svc_sock *svsk;
1632 struct sock *inet; 1114 struct sock *inet;
1633 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1115 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1634 int is_temporary = flags & SVC_SOCK_TEMPORARY;
1635 1116
1636 dprintk("svc: svc_setup_socket %p\n", sock); 1117 dprintk("svc: svc_setup_socket %p\n", sock);
1637 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1118 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1651,44 +1132,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1651 return NULL; 1132 return NULL;
1652 } 1133 }
1653 1134
1654 set_bit(SK_BUSY, &svsk->sk_flags);
1655 inet->sk_user_data = svsk; 1135 inet->sk_user_data = svsk;
1656 svsk->sk_sock = sock; 1136 svsk->sk_sock = sock;
1657 svsk->sk_sk = inet; 1137 svsk->sk_sk = inet;
1658 svsk->sk_ostate = inet->sk_state_change; 1138 svsk->sk_ostate = inet->sk_state_change;
1659 svsk->sk_odata = inet->sk_data_ready; 1139 svsk->sk_odata = inet->sk_data_ready;
1660 svsk->sk_owspace = inet->sk_write_space; 1140 svsk->sk_owspace = inet->sk_write_space;
1661 svsk->sk_server = serv;
1662 atomic_set(&svsk->sk_inuse, 1);
1663 svsk->sk_lastrecv = get_seconds();
1664 spin_lock_init(&svsk->sk_lock);
1665 INIT_LIST_HEAD(&svsk->sk_deferred);
1666 INIT_LIST_HEAD(&svsk->sk_ready);
1667 mutex_init(&svsk->sk_mutex);
1668 1141
1669 /* Initialize the socket */ 1142 /* Initialize the socket */
1670 if (sock->type == SOCK_DGRAM) 1143 if (sock->type == SOCK_DGRAM)
1671 svc_udp_init(svsk); 1144 svc_udp_init(svsk, serv);
1672 else 1145 else
1673 svc_tcp_init(svsk); 1146 svc_tcp_init(svsk, serv);
1674
1675 spin_lock_bh(&serv->sv_lock);
1676 if (is_temporary) {
1677 set_bit(SK_TEMP, &svsk->sk_flags);
1678 list_add(&svsk->sk_list, &serv->sv_tempsocks);
1679 serv->sv_tmpcnt++;
1680 if (serv->sv_temptimer.function == NULL) {
1681 /* setup timer to age temp sockets */
1682 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
1683 (unsigned long)serv);
1684 mod_timer(&serv->sv_temptimer,
1685 jiffies + svc_conn_age_period * HZ);
1686 }
1687 } else {
1688 clear_bit(SK_TEMP, &svsk->sk_flags);
1689 list_add(&svsk->sk_list, &serv->sv_permsocks);
1690 }
1691 spin_unlock_bh(&serv->sv_lock);
1692 1147
1693 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1148 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1694 svsk, svsk->sk_sk); 1149 svsk, svsk->sk_sk);
@@ -1717,7 +1172,16 @@ int svc_addsock(struct svc_serv *serv,
1717 else { 1172 else {
1718 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); 1173 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
1719 if (svsk) { 1174 if (svsk) {
1720 svc_sock_received(svsk); 1175 struct sockaddr_storage addr;
1176 struct sockaddr *sin = (struct sockaddr *)&addr;
1177 int salen;
1178 if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
1179 svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
1180 clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
1181 spin_lock_bh(&serv->sv_lock);
1182 list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
1183 spin_unlock_bh(&serv->sv_lock);
1184 svc_xprt_received(&svsk->sk_xprt);
1721 err = 0; 1185 err = 0;
1722 } 1186 }
1723 } 1187 }
@@ -1733,14 +1197,19 @@ EXPORT_SYMBOL_GPL(svc_addsock);
1733/* 1197/*
1734 * Create socket for RPC service. 1198 * Create socket for RPC service.
1735 */ 1199 */
1736static int svc_create_socket(struct svc_serv *serv, int protocol, 1200static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1737 struct sockaddr *sin, int len, int flags) 1201 int protocol,
1202 struct sockaddr *sin, int len,
1203 int flags)
1738{ 1204{
1739 struct svc_sock *svsk; 1205 struct svc_sock *svsk;
1740 struct socket *sock; 1206 struct socket *sock;
1741 int error; 1207 int error;
1742 int type; 1208 int type;
1743 char buf[RPC_MAX_ADDRBUFLEN]; 1209 char buf[RPC_MAX_ADDRBUFLEN];
1210 struct sockaddr_storage addr;
1211 struct sockaddr *newsin = (struct sockaddr *)&addr;
1212 int newlen;
1744 1213
1745 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1214 dprintk("svc: svc_create_socket(%s, %d, %s)\n",
1746 serv->sv_program->pg_name, protocol, 1215 serv->sv_program->pg_name, protocol,
@@ -1749,13 +1218,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1749 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1218 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
1750 printk(KERN_WARNING "svc: only UDP and TCP " 1219 printk(KERN_WARNING "svc: only UDP and TCP "
1751 "sockets supported\n"); 1220 "sockets supported\n");
1752 return -EINVAL; 1221 return ERR_PTR(-EINVAL);
1753 } 1222 }
1754 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1223 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1755 1224
1756 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1225 error = sock_create_kern(sin->sa_family, type, protocol, &sock);
1757 if (error < 0) 1226 if (error < 0)
1758 return error; 1227 return ERR_PTR(error);
1759 1228
1760 svc_reclassify_socket(sock); 1229 svc_reclassify_socket(sock);
1761 1230
@@ -1765,203 +1234,55 @@ static int svc_create_socket(struct svc_serv *serv, int protocol,
1765 if (error < 0) 1234 if (error < 0)
1766 goto bummer; 1235 goto bummer;
1767 1236
1237 newlen = len;
1238 error = kernel_getsockname(sock, newsin, &newlen);
1239 if (error < 0)
1240 goto bummer;
1241
1768 if (protocol == IPPROTO_TCP) { 1242 if (protocol == IPPROTO_TCP) {
1769 if ((error = kernel_listen(sock, 64)) < 0) 1243 if ((error = kernel_listen(sock, 64)) < 0)
1770 goto bummer; 1244 goto bummer;
1771 } 1245 }
1772 1246
1773 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { 1247 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
1774 svc_sock_received(svsk); 1248 svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
1775 return ntohs(inet_sk(svsk->sk_sk)->sport); 1249 return (struct svc_xprt *)svsk;
1776 } 1250 }
1777 1251
1778bummer: 1252bummer:
1779 dprintk("svc: svc_create_socket error = %d\n", -error); 1253 dprintk("svc: svc_create_socket error = %d\n", -error);
1780 sock_release(sock); 1254 sock_release(sock);
1781 return error; 1255 return ERR_PTR(error);
1782} 1256}
1783 1257
1784/* 1258/*
1785 * Remove a dead socket 1259 * Detach the svc_sock from the socket so that no
1260 * more callbacks occur.
1786 */ 1261 */
1787static void 1262static void svc_sock_detach(struct svc_xprt *xprt)
1788svc_delete_socket(struct svc_sock *svsk)
1789{ 1263{
1790 struct svc_serv *serv; 1264 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1791 struct sock *sk; 1265 struct sock *sk = svsk->sk_sk;
1792
1793 dprintk("svc: svc_delete_socket(%p)\n", svsk);
1794 1266
1795 serv = svsk->sk_server; 1267 dprintk("svc: svc_sock_detach(%p)\n", svsk);
1796 sk = svsk->sk_sk;
1797 1268
1269 /* put back the old socket callbacks */
1798 sk->sk_state_change = svsk->sk_ostate; 1270 sk->sk_state_change = svsk->sk_ostate;
1799 sk->sk_data_ready = svsk->sk_odata; 1271 sk->sk_data_ready = svsk->sk_odata;
1800 sk->sk_write_space = svsk->sk_owspace; 1272 sk->sk_write_space = svsk->sk_owspace;
1801
1802 spin_lock_bh(&serv->sv_lock);
1803
1804 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
1805 list_del_init(&svsk->sk_list);
1806 /*
1807 * We used to delete the svc_sock from whichever list
1808	 * its sk_ready node was on, but we don't actually
1809 * need to. This is because the only time we're called
1810 * while still attached to a queue, the queue itself
1811 * is about to be destroyed (in svc_destroy).
1812 */
1813 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
1814 BUG_ON(atomic_read(&svsk->sk_inuse)<2);
1815 atomic_dec(&svsk->sk_inuse);
1816 if (test_bit(SK_TEMP, &svsk->sk_flags))
1817 serv->sv_tmpcnt--;
1818 }
1819
1820 spin_unlock_bh(&serv->sv_lock);
1821}
1822
1823static void svc_close_socket(struct svc_sock *svsk)
1824{
1825 set_bit(SK_CLOSE, &svsk->sk_flags);
1826 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
1827 /* someone else will have to effect the close */
1828 return;
1829
1830 atomic_inc(&svsk->sk_inuse);
1831 svc_delete_socket(svsk);
1832 clear_bit(SK_BUSY, &svsk->sk_flags);
1833 svc_sock_put(svsk);
1834}
1835
1836void svc_force_close_socket(struct svc_sock *svsk)
1837{
1838 set_bit(SK_CLOSE, &svsk->sk_flags);
1839 if (test_bit(SK_BUSY, &svsk->sk_flags)) {
1840 /* Waiting to be processed, but no threads left,
1841 * So just remove it from the waiting list
1842 */
1843 list_del_init(&svsk->sk_ready);
1844 clear_bit(SK_BUSY, &svsk->sk_flags);
1845 }
1846 svc_close_socket(svsk);
1847}
1848
1849/**
1850 * svc_makesock - Make a socket for nfsd and lockd
1851 * @serv: RPC server structure
1852 * @protocol: transport protocol to use
1853 * @port: port to use
1854 * @flags: requested socket characteristics
1855 *
1856 */
1857int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port,
1858 int flags)
1859{
1860 struct sockaddr_in sin = {
1861 .sin_family = AF_INET,
1862 .sin_addr.s_addr = INADDR_ANY,
1863 .sin_port = htons(port),
1864 };
1865
1866 dprintk("svc: creating socket proto = %d\n", protocol);
1867 return svc_create_socket(serv, protocol, (struct sockaddr *) &sin,
1868 sizeof(sin), flags);
1869} 1273}
1870 1274
1871/* 1275/*
1872 * Handle defer and revisit of requests 1276 * Free the svc_sock's socket resources and the svc_sock itself.
1873 */ 1277 */
1874 1278static void svc_sock_free(struct svc_xprt *xprt)
1875static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
1876{ 1279{
1877 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1280 struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1878 struct svc_sock *svsk; 1281 dprintk("svc: svc_sock_free(%p)\n", svsk);
1879 1282
1880 if (too_many) { 1283 if (svsk->sk_sock->file)
1881 svc_sock_put(dr->svsk); 1284 sockfd_put(svsk->sk_sock);
1882 kfree(dr); 1285 else
1883 return; 1286 sock_release(svsk->sk_sock);
1884 } 1287 kfree(svsk);
1885 dprintk("revisit queued\n");
1886 svsk = dr->svsk;
1887 dr->svsk = NULL;
1888 spin_lock(&svsk->sk_lock);
1889 list_add(&dr->handle.recent, &svsk->sk_deferred);
1890 spin_unlock(&svsk->sk_lock);
1891 set_bit(SK_DEFERRED, &svsk->sk_flags);
1892 svc_sock_enqueue(svsk);
1893 svc_sock_put(svsk);
1894}
1895
1896static struct cache_deferred_req *
1897svc_defer(struct cache_req *req)
1898{
1899 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
1900 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
1901 struct svc_deferred_req *dr;
1902
1903 if (rqstp->rq_arg.page_len)
1904 return NULL; /* if more than a page, give up FIXME */
1905 if (rqstp->rq_deferred) {
1906 dr = rqstp->rq_deferred;
1907 rqstp->rq_deferred = NULL;
1908 } else {
1909 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
1910 /* FIXME maybe discard if size too large */
1911 dr = kmalloc(size, GFP_KERNEL);
1912 if (dr == NULL)
1913 return NULL;
1914
1915 dr->handle.owner = rqstp->rq_server;
1916 dr->prot = rqstp->rq_prot;
1917 memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
1918 dr->addrlen = rqstp->rq_addrlen;
1919 dr->daddr = rqstp->rq_daddr;
1920 dr->argslen = rqstp->rq_arg.len >> 2;
1921 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
1922 }
1923 atomic_inc(&rqstp->rq_sock->sk_inuse);
1924 dr->svsk = rqstp->rq_sock;
1925
1926 dr->handle.revisit = svc_revisit;
1927 return &dr->handle;
1928}
1929
1930/*
1931 * recv data from a deferred request into an active one
1932 */
1933static int svc_deferred_recv(struct svc_rqst *rqstp)
1934{
1935 struct svc_deferred_req *dr = rqstp->rq_deferred;
1936
1937 rqstp->rq_arg.head[0].iov_base = dr->args;
1938 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
1939 rqstp->rq_arg.page_len = 0;
1940 rqstp->rq_arg.len = dr->argslen<<2;
1941 rqstp->rq_prot = dr->prot;
1942 memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
1943 rqstp->rq_addrlen = dr->addrlen;
1944 rqstp->rq_daddr = dr->daddr;
1945 rqstp->rq_respages = rqstp->rq_pages;
1946 return dr->argslen<<2;
1947}
1948
1949
1950static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1951{
1952 struct svc_deferred_req *dr = NULL;
1953
1954 if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
1955 return NULL;
1956 spin_lock(&svsk->sk_lock);
1957 clear_bit(SK_DEFERRED, &svsk->sk_flags);
1958 if (!list_empty(&svsk->sk_deferred)) {
1959 dr = list_entry(svsk->sk_deferred.next,
1960 struct svc_deferred_req,
1961 handle.recent);
1962 list_del_init(&dr->handle.recent);
1963 set_bit(SK_DEFERRED, &svsk->sk_flags);
1964 }
1965 spin_unlock(&svsk->sk_lock);
1966 return dr;
1967} 1288}
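The deferral trio above (svc_defer, svc_revisit, svc_deferred_recv/svc_deferred_dequeue) is a park-and-replay scheme: rather than block a server thread on an upcall, svc_defer snapshots the raw request bytes; when the blocking work completes, svc_revisit hangs the snapshot on the socket's sk_deferred list and re-enqueues the socket; the next thread to service it replays the snapshot through svc_deferred_recv as though it had just arrived off the wire. A compressed, single-threaded userspace sketch of that flow (names are hypothetical, and the real code also carries address and protocol state in the snapshot):

	#include <stdlib.h>
	#include <string.h>

	/* A parked request: just the raw bytes, like svc_deferred_req. */
	struct deferred {
		struct deferred *next;
		size_t len;
		unsigned char bytes[];
	};

	static struct deferred *deferred_q;	/* models sk_deferred */

	/* svc_defer(): snapshot the request instead of blocking. */
	struct deferred *defer(const void *req, size_t len)
	{
		struct deferred *d = malloc(sizeof(*d) + len);

		if (d) {
			d->len = len;
			memcpy(d->bytes, req, len);
		}
		return d;
	}

	/* svc_revisit(): the slow work finished; queue for replay. */
	void revisit(struct deferred *d)
	{
		d->next = deferred_q;
		deferred_q = d;	/* real code also re-enqueues the socket */
	}

	/* svc_deferred_dequeue() + svc_deferred_recv(): replay parked
	 * requests before reading anything new from the wire. */
	struct deferred *next_request(void)
	{
		struct deferred *d = deferred_q;

		if (d)
			deferred_q = d->next;
		return d;	/* caller parses d->bytes as a fresh request */
	}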
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index bada7de0c2fc..0f8c439b848a 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -18,6 +18,7 @@
18#include <linux/sunrpc/types.h> 18#include <linux/sunrpc/types.h>
19#include <linux/sunrpc/sched.h> 19#include <linux/sunrpc/sched.h>
20#include <linux/sunrpc/stats.h> 20#include <linux/sunrpc/stats.h>
21#include <linux/sunrpc/svc_xprt.h>
21 22
22/* 23/*
23 * Declare the debug flags here 24 * Declare the debug flags here
@@ -55,6 +56,30 @@ rpc_unregister_sysctl(void)
55 } 56 }
56} 57}
57 58
59static int proc_do_xprt(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos)
61{
62 char tmpbuf[256];
63 int len;
64 if ((*ppos && !write) || !*lenp) {
65 *lenp = 0;
66 return 0;
67 }
68 if (write)
69 return -EINVAL;
70 else {
71 len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
72 if (!access_ok(VERIFY_WRITE, buffer, len))
73 return -EFAULT;
74
75 if (__copy_to_user(buffer, tmpbuf, len))
76 return -EFAULT;
77 }
78 *lenp -= len;
79 *ppos += len;
80 return 0;
81}
82
58static int 83static int
59proc_dodebug(ctl_table *table, int write, struct file *file, 84proc_dodebug(ctl_table *table, int write, struct file *file,
60 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -147,6 +172,12 @@ static ctl_table debug_table[] = {
147 .mode = 0644, 172 .mode = 0644,
148 .proc_handler = &proc_dodebug 173 .proc_handler = &proc_dodebug
149 }, 174 },
175 {
176 .procname = "transports",
177 .maxlen = 256,
178 .mode = 0444,
179 .proc_handler = &proc_do_xprt,
180 },
150 { .ctl_name = 0 } 181 { .ctl_name = 0 }
151}; 182};
152 183
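The new read-only "transports" entry exposes the registered transport classes to userspace; assuming the usual sysctl-to-procfs mapping of the sunrpc table, it shows up as /proc/sys/sunrpc/transports (the path is inferred from the tables above, not spelled out in the patch). A minimal reader:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/sys/sunrpc/transports", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* one registered class per line */
		fclose(f);
		return 0;
	}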
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 54264062ea69..995c3fdc16c2 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string)
96EXPORT_SYMBOL(xdr_encode_string); 96EXPORT_SYMBOL(xdr_encode_string);
97 97
98__be32 * 98__be32 *
99xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) 99xdr_decode_string_inplace(__be32 *p, char **sp,
100 unsigned int *lenp, unsigned int maxlen)
100{ 101{
101 unsigned int len; 102 u32 len;
102 103
103 if ((len = ntohl(*p++)) > maxlen) 104 len = ntohl(*p++);
105 if (len > maxlen)
104 return NULL; 106 return NULL;
105 *lenp = len; 107 *lenp = len;
106 *sp = (char *) p; 108 *sp = (char *) p;
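With the switch to unsigned lengths, callers of xdr_decode_string_inplace() can no longer be confused by a negative length sneaking past the maxlen check. A hedged kernel-style caller sketch (demo_decode_name is hypothetical; the decoded string points into the XDR buffer and is not NUL-terminated):

	#include <linux/errno.h>
	#include <linux/sunrpc/xdr.h>

	static int demo_decode_name(__be32 *p, char **namep, unsigned int *lenp)
	{
		/* Refuse anything longer than 64 bytes; a NULL return means
		 * the on-the-wire length exceeded maxlen. */
		p = xdr_decode_string_inplace(p, namep, lenp, 64);
		if (p == NULL)
			return -EINVAL;
		return 0;
	}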
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 264f0feeb513..5a8f268bdd30 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,3 +1,8 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o
4
5obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
6
7svcrdma-y := svc_rdma.o svc_rdma_transport.o \
8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 000000000000..88c0ca20bb1e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,266 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41#include <linux/module.h>
42#include <linux/init.h>
43#include <linux/fs.h>
44#include <linux/sysctl.h>
45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/sched.h>
47#include <linux/sunrpc/svc_rdma.h>
48
49#define RPCDBG_FACILITY RPCDBG_SVCXPRT
50
51/* RPC/RDMA parameters */
52unsigned int svcrdma_ord = RPCRDMA_ORD;
53static unsigned int min_ord = 1;
54static unsigned int max_ord = 4096;
55unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
56static unsigned int min_max_requests = 4;
57static unsigned int max_max_requests = 16384;
58unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
59static unsigned int min_max_inline = 4096;
60static unsigned int max_max_inline = 65536;
61
62atomic_t rdma_stat_recv;
63atomic_t rdma_stat_read;
64atomic_t rdma_stat_write;
65atomic_t rdma_stat_sq_starve;
66atomic_t rdma_stat_rq_starve;
67atomic_t rdma_stat_rq_poll;
68atomic_t rdma_stat_rq_prod;
69atomic_t rdma_stat_sq_poll;
70atomic_t rdma_stat_sq_prod;
71
72/*
73 * This function implements reading and resetting an atomic_t stat
74 * variable through read/write to a proc file. Any write to the file
75 * resets the associated statistic to zero. Any read returns its
76 * current value.
77 */
78static int read_reset_stat(ctl_table *table, int write,
79 struct file *filp, void __user *buffer, size_t *lenp,
80 loff_t *ppos)
81{
82 atomic_t *stat = (atomic_t *)table->data;
83
84 if (!stat)
85 return -EINVAL;
86
87 if (write)
88 atomic_set(stat, 0);
89 else {
90 char str_buf[32];
91 char *data;
92 int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
93 if (len >= 32)
94 return -EFAULT;
95 len = strlen(str_buf);
96 if (*ppos > len) {
97 *lenp = 0;
98 return 0;
99 }
100 data = &str_buf[*ppos];
101 len -= *ppos;
102 if (len > *lenp)
103 len = *lenp;
104		if (len && copy_to_user(buffer, data, len))
105 return -EFAULT;
106 *lenp = len;
107 *ppos += len;
108 }
109 return 0;
110}
111
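Each counter wired to read_reset_stat() in the tables below should appear (again assuming the standard sysctl-to-procfs mapping) as a file under /proc/sys/sunrpc/svc_rdma/: a read returns the current value and any write resets it to zero. A userspace sketch exercising both paths; the exact path is an assumption:

	#include <stdio.h>

	#define STAT "/proc/sys/sunrpc/svc_rdma/rdma_stat_read"

	int main(void)
	{
		char buf[32];
		FILE *f = fopen(STAT, "r");

		if (f) {
			if (fgets(buf, sizeof(buf), f))
				printf("rdma_stat_read = %s", buf);
			fclose(f);
		}

		f = fopen(STAT, "w");	/* any write zeroes the counter */
		if (f) {
			fputs("0\n", f);
			fclose(f);
		}
		return 0;
	}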
112static struct ctl_table_header *svcrdma_table_header;
113static ctl_table svcrdma_parm_table[] = {
114 {
115 .procname = "max_requests",
116 .data = &svcrdma_max_requests,
117 .maxlen = sizeof(unsigned int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_minmax,
120 .strategy = &sysctl_intvec,
121 .extra1 = &min_max_requests,
122 .extra2 = &max_max_requests
123 },
124 {
125 .procname = "max_req_size",
126 .data = &svcrdma_max_req_size,
127 .maxlen = sizeof(unsigned int),
128 .mode = 0644,
129 .proc_handler = &proc_dointvec_minmax,
130 .strategy = &sysctl_intvec,
131 .extra1 = &min_max_inline,
132 .extra2 = &max_max_inline
133 },
134 {
135 .procname = "max_outbound_read_requests",
136 .data = &svcrdma_ord,
137 .maxlen = sizeof(unsigned int),
138 .mode = 0644,
139 .proc_handler = &proc_dointvec_minmax,
140 .strategy = &sysctl_intvec,
141 .extra1 = &min_ord,
142 .extra2 = &max_ord,
143 },
144
145 {
146 .procname = "rdma_stat_read",
147 .data = &rdma_stat_read,
148 .maxlen = sizeof(atomic_t),
149 .mode = 0644,
150 .proc_handler = &read_reset_stat,
151 },
152 {
153 .procname = "rdma_stat_recv",
154 .data = &rdma_stat_recv,
155 .maxlen = sizeof(atomic_t),
156 .mode = 0644,
157 .proc_handler = &read_reset_stat,
158 },
159 {
160 .procname = "rdma_stat_write",
161 .data = &rdma_stat_write,
162 .maxlen = sizeof(atomic_t),
163 .mode = 0644,
164 .proc_handler = &read_reset_stat,
165 },
166 {
167 .procname = "rdma_stat_sq_starve",
168 .data = &rdma_stat_sq_starve,
169 .maxlen = sizeof(atomic_t),
170 .mode = 0644,
171 .proc_handler = &read_reset_stat,
172 },
173 {
174 .procname = "rdma_stat_rq_starve",
175 .data = &rdma_stat_rq_starve,
176 .maxlen = sizeof(atomic_t),
177 .mode = 0644,
178 .proc_handler = &read_reset_stat,
179 },
180 {
181 .procname = "rdma_stat_rq_poll",
182 .data = &rdma_stat_rq_poll,
183 .maxlen = sizeof(atomic_t),
184 .mode = 0644,
185 .proc_handler = &read_reset_stat,
186 },
187 {
188 .procname = "rdma_stat_rq_prod",
189 .data = &rdma_stat_rq_prod,
190 .maxlen = sizeof(atomic_t),
191 .mode = 0644,
192 .proc_handler = &read_reset_stat,
193 },
194 {
195 .procname = "rdma_stat_sq_poll",
196 .data = &rdma_stat_sq_poll,
197 .maxlen = sizeof(atomic_t),
198 .mode = 0644,
199 .proc_handler = &read_reset_stat,
200 },
201 {
202 .procname = "rdma_stat_sq_prod",
203 .data = &rdma_stat_sq_prod,
204 .maxlen = sizeof(atomic_t),
205 .mode = 0644,
206 .proc_handler = &read_reset_stat,
207 },
208 {
209 .ctl_name = 0,
210 },
211};
212
213static ctl_table svcrdma_table[] = {
214 {
215 .procname = "svc_rdma",
216 .mode = 0555,
217 .child = svcrdma_parm_table
218 },
219 {
220 .ctl_name = 0,
221 },
222};
223
224static ctl_table svcrdma_root_table[] = {
225 {
226 .ctl_name = CTL_SUNRPC,
227 .procname = "sunrpc",
228 .mode = 0555,
229 .child = svcrdma_table
230 },
231 {
232 .ctl_name = 0,
233 },
234};
235
236void svc_rdma_cleanup(void)
237{
238 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
239 if (svcrdma_table_header) {
240 unregister_sysctl_table(svcrdma_table_header);
241 svcrdma_table_header = NULL;
242 }
243 svc_unreg_xprt_class(&svc_rdma_class);
244}
245
246int svc_rdma_init(void)
247{
248 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
249 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
250 dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
251 dprintk("\tsq_depth : %d\n",
252 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
253 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
254 if (!svcrdma_table_header)
255 svcrdma_table_header =
256 register_sysctl_table(svcrdma_root_table);
257
258 /* Register RDMA with the SVC transport switch */
259 svc_reg_xprt_class(&svc_rdma_class);
260 return 0;
261}
262MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
263MODULE_DESCRIPTION("SVC RDMA Transport");
264MODULE_LICENSE("Dual BSD/GPL");
265module_init(svc_rdma_init);
266module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 000000000000..9530ef2d40dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/xdr.h>
43#include <linux/sunrpc/debug.h>
44#include <asm/unaligned.h>
45#include <linux/sunrpc/rpc_rdma.h>
46#include <linux/sunrpc/svc_rdma.h>
47
48#define RPCDBG_FACILITY RPCDBG_SVCXPRT
49
50/*
51 * Decodes a read chunk list. The expected format is as follows:
52 *	discrim    : xdr_one
53 * position : u32 offset into XDR stream
54 * handle : u32 RKEY
55 * . . .
56 * end-of-list: xdr_zero
57 */
58static u32 *decode_read_list(u32 *va, u32 *vaend)
59{
60 struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
61
62 while (ch->rc_discrim != xdr_zero) {
63 u64 ch_offset;
64
65 if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
66 (unsigned long)vaend) {
67 dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
68 return NULL;
69 }
70
71 ch->rc_discrim = ntohl(ch->rc_discrim);
72 ch->rc_position = ntohl(ch->rc_position);
73 ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
74 ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
75 va = (u32 *)&ch->rc_target.rs_offset;
76 xdr_decode_hyper(va, &ch_offset);
77 put_unaligned(ch_offset, (u64 *)va);
78 ch++;
79 }
80 return (u32 *)&ch->rc_position;
81}
82
83/*
84 * Determine number of chunks and total bytes in chunk list. The chunk
85 * list has already been verified to fit within the RPCRDMA header.
86 */
87void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
88 int *ch_count, int *byte_count)
89{
90 /* compute the number of bytes represented by read chunks */
91 *byte_count = 0;
92 *ch_count = 0;
93 for (; ch->rc_discrim != 0; ch++) {
94 *byte_count = *byte_count + ch->rc_target.rs_length;
95 *ch_count = *ch_count + 1;
96 }
97}
98
99/*
100 * Decodes a write chunk list. The expected format is as follows:
101 *  discrim  : xdr_one
102 *  nchunks  : <count>
103 *   handle  : u32 RKEY              ---+
104 *   length  : u32 <len of segment>     |
105 *   offset  : remote va + <count>      |
106 *   . . .                              |
107 *                                   ---+
108 */
109static u32 *decode_write_list(u32 *va, u32 *vaend)
110{
111 int ch_no;
112 struct rpcrdma_write_array *ary =
113 (struct rpcrdma_write_array *)va;
114
115	/* Check for no write-array */
116 if (ary->wc_discrim == xdr_zero)
117 return (u32 *)&ary->wc_nchunks;
118
119 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
120 (unsigned long)vaend) {
121 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
122 return NULL;
123 }
124 ary->wc_discrim = ntohl(ary->wc_discrim);
125 ary->wc_nchunks = ntohl(ary->wc_nchunks);
126 if (((unsigned long)&ary->wc_array[0] +
127 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
128 (unsigned long)vaend) {
129 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
130 ary, ary->wc_nchunks, vaend);
131 return NULL;
132 }
133 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
134 u64 ch_offset;
135
136 ary->wc_array[ch_no].wc_target.rs_handle =
137 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
138 ary->wc_array[ch_no].wc_target.rs_length =
139 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
140 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
141 xdr_decode_hyper(va, &ch_offset);
142 put_unaligned(ch_offset, (u64 *)va);
143 }
144
145 /*
146 * rs_length is the 2nd 4B field in wc_target and taking its
147 * address skips the list terminator
148 */
149 return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
150}
151
152static u32 *decode_reply_array(u32 *va, u32 *vaend)
153{
154 int ch_no;
155 struct rpcrdma_write_array *ary =
156 (struct rpcrdma_write_array *)va;
157
158 /* Check for no reply-array */
159 if (ary->wc_discrim == xdr_zero)
160 return (u32 *)&ary->wc_nchunks;
161
162 if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
163 (unsigned long)vaend) {
164 dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
165 return NULL;
166 }
167 ary->wc_discrim = ntohl(ary->wc_discrim);
168 ary->wc_nchunks = ntohl(ary->wc_nchunks);
169 if (((unsigned long)&ary->wc_array[0] +
170 (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
171 (unsigned long)vaend) {
172 dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
173 ary, ary->wc_nchunks, vaend);
174 return NULL;
175 }
176 for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
177 u64 ch_offset;
178
179 ary->wc_array[ch_no].wc_target.rs_handle =
180 ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
181 ary->wc_array[ch_no].wc_target.rs_length =
182 ntohl(ary->wc_array[ch_no].wc_target.rs_length);
183 va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
184 xdr_decode_hyper(va, &ch_offset);
185 put_unaligned(ch_offset, (u64 *)va);
186 }
187
188 return (u32 *)&ary->wc_array[ch_no];
189}
190
191int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
192 struct svc_rqst *rqstp)
193{
194 struct rpcrdma_msg *rmsgp = NULL;
195 u32 *va;
196 u32 *vaend;
197 u32 hdr_len;
198
199 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
200
201	/* Verify that there are enough bytes for header + something */
202 if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
203 dprintk("svcrdma: header too short = %d\n",
204 rqstp->rq_arg.len);
205 return -EINVAL;
206 }
207
208 /* Decode the header */
209 rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
210 rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
211 rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
212 rmsgp->rm_type = ntohl(rmsgp->rm_type);
213
214 if (rmsgp->rm_vers != RPCRDMA_VERSION)
215 return -ENOSYS;
216
217 /* Pull in the extra for the padded case and bump our pointer */
218 if (rmsgp->rm_type == RDMA_MSGP) {
219 int hdrlen;
220 rmsgp->rm_body.rm_padded.rm_align =
221 ntohl(rmsgp->rm_body.rm_padded.rm_align);
222 rmsgp->rm_body.rm_padded.rm_thresh =
223 ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
224
225 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
226 rqstp->rq_arg.head[0].iov_base = va;
227 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
228 rqstp->rq_arg.head[0].iov_len -= hdrlen;
229 if (hdrlen > rqstp->rq_arg.len)
230 return -EINVAL;
231 return hdrlen;
232 }
233
234 /* The chunk list may contain either a read chunk list or a write
235 * chunk list and a reply chunk list.
236 */
237 va = &rmsgp->rm_body.rm_chunks[0];
238 vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
239 va = decode_read_list(va, vaend);
240 if (!va)
241 return -EINVAL;
242 va = decode_write_list(va, vaend);
243 if (!va)
244 return -EINVAL;
245 va = decode_reply_array(va, vaend);
246 if (!va)
247 return -EINVAL;
248
249 rqstp->rq_arg.head[0].iov_base = va;
250 hdr_len = (unsigned long)va - (unsigned long)rmsgp;
251 rqstp->rq_arg.head[0].iov_len -= hdr_len;
252
253 *rdma_req = rmsgp;
254 return hdr_len;
255}
256
257int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
258{
259 struct rpcrdma_msg *rmsgp = NULL;
260 struct rpcrdma_read_chunk *ch;
261 struct rpcrdma_write_array *ary;
262 u32 *va;
263 u32 hdrlen;
264
265 dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
266 rqstp);
267 rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
268
269 /* Pull in the extra for the padded case and bump our pointer */
270 if (rmsgp->rm_type == RDMA_MSGP) {
271 va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
272 rqstp->rq_arg.head[0].iov_base = va;
273 hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
274 rqstp->rq_arg.head[0].iov_len -= hdrlen;
275 return hdrlen;
276 }
277
278 /*
279 * Skip all chunks to find RPC msg. These were previously processed
280 */
281 va = &rmsgp->rm_body.rm_chunks[0];
282
283 /* Skip read-list */
284 for (ch = (struct rpcrdma_read_chunk *)va;
285 ch->rc_discrim != xdr_zero; ch++);
286 va = (u32 *)&ch->rc_position;
287
288 /* Skip write-list */
289 ary = (struct rpcrdma_write_array *)va;
290 if (ary->wc_discrim == xdr_zero)
291 va = (u32 *)&ary->wc_nchunks;
292 else
293 /*
294 * rs_length is the 2nd 4B field in wc_target and taking its
295 * address skips the list terminator
296 */
297 va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
298
299 /* Skip reply-array */
300 ary = (struct rpcrdma_write_array *)va;
301 if (ary->wc_discrim == xdr_zero)
302 va = (u32 *)&ary->wc_nchunks;
303 else
304 va = (u32 *)&ary->wc_array[ary->wc_nchunks];
305
306 rqstp->rq_arg.head[0].iov_base = va;
307 hdrlen = (unsigned long)va - (unsigned long)rmsgp;
308 rqstp->rq_arg.head[0].iov_len -= hdrlen;
309
310 return hdrlen;
311}
312
313int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
314 struct rpcrdma_msg *rmsgp,
315 enum rpcrdma_errcode err, u32 *va)
316{
317 u32 *startp = va;
318
319 *va++ = htonl(rmsgp->rm_xid);
320 *va++ = htonl(rmsgp->rm_vers);
321 *va++ = htonl(xprt->sc_max_requests);
322 *va++ = htonl(RDMA_ERROR);
323 *va++ = htonl(err);
324 if (err == ERR_VERS) {
325 *va++ = htonl(RPCRDMA_VERSION);
326 *va++ = htonl(RPCRDMA_VERSION);
327 }
328
329 return (int)((unsigned long)va - (unsigned long)startp);
330}
331
332int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
333{
334 struct rpcrdma_write_array *wr_ary;
335
336 /* There is no read-list in a reply */
337
338 /* skip write list */
339 wr_ary = (struct rpcrdma_write_array *)
340 &rmsgp->rm_body.rm_chunks[1];
341 if (wr_ary->wc_discrim)
342 wr_ary = (struct rpcrdma_write_array *)
343 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
344 wc_target.rs_length;
345 else
346 wr_ary = (struct rpcrdma_write_array *)
347 &wr_ary->wc_nchunks;
348
349 /* skip reply array */
350 if (wr_ary->wc_discrim)
351 wr_ary = (struct rpcrdma_write_array *)
352 &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
353 else
354 wr_ary = (struct rpcrdma_write_array *)
355 &wr_ary->wc_nchunks;
356
357 return (unsigned long) wr_ary - (unsigned long) rmsgp;
358}
359
360void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
361{
362 struct rpcrdma_write_array *ary;
363
364 /* no read-list */
365 rmsgp->rm_body.rm_chunks[0] = xdr_zero;
366
367 /* write-array discrim */
368 ary = (struct rpcrdma_write_array *)
369 &rmsgp->rm_body.rm_chunks[1];
370 ary->wc_discrim = xdr_one;
371 ary->wc_nchunks = htonl(chunks);
372
373 /* write-list terminator */
374 ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
375
376 /* reply-array discriminator */
377 ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
378}
379
380void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
381 int chunks)
382{
383 ary->wc_discrim = xdr_one;
384 ary->wc_nchunks = htonl(chunks);
385}
386
387void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
388 int chunk_no,
389 u32 rs_handle, u64 rs_offset,
390 u32 write_len)
391{
392 struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
393 seg->rs_handle = htonl(rs_handle);
394 seg->rs_length = htonl(write_len);
395 xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset);
396}
397
398void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
399 struct rpcrdma_msg *rdma_argp,
400 struct rpcrdma_msg *rdma_resp,
401 enum rpcrdma_proc rdma_type)
402{
403 rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
404 rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
405 rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
406 rdma_resp->rm_type = htonl(rdma_type);
407
408	/* Encode <nul> chunk lists */
409 rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
410 rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
411 rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
412}
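For reference, the chunkless reply header built just above is seven 32-bit XDR words on the wire. A worked layout (the example values are illustrative, not taken from the patch):

	/* Word  Field         Example (host order, before htonl)
	 *  0    rm_xid        0x1234abcd - echoed from rdma_argp
	 *  1    rm_vers       1          - echoed from rdma_argp
	 *  2    rm_credit     16         - xprt->sc_max_requests
	 *  3    rm_type       RDMA_MSG   - the rdma_type argument
	 *  4    rm_chunks[0]  xdr_zero   - empty read list
	 *  5    rm_chunks[1]  xdr_zero   - empty write list
	 *  6    rm_chunks[2]  xdr_zero   - empty reply chunk
	 */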
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 000000000000..ab54a736486e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/*
53 * Replace the pages in the rq_pages array with the pages from the SGE in
54 * the RDMA_RECV completion. The SGL should contain full pages up until the
55 * last one.
56 */
57static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
58 struct svc_rdma_op_ctxt *ctxt,
59 u32 byte_count)
60{
61 struct page *page;
62 u32 bc;
63 int sge_no;
64
65 /* Swap the page in the SGE with the page in argpages */
66 page = ctxt->pages[0];
67 put_page(rqstp->rq_pages[0]);
68 rqstp->rq_pages[0] = page;
69
70 /* Set up the XDR head */
71 rqstp->rq_arg.head[0].iov_base = page_address(page);
72 rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
73 rqstp->rq_arg.len = byte_count;
74 rqstp->rq_arg.buflen = byte_count;
75
76 /* Compute bytes past head in the SGL */
77 bc = byte_count - rqstp->rq_arg.head[0].iov_len;
78
79 /* If data remains, store it in the pagelist */
80 rqstp->rq_arg.page_len = bc;
81 rqstp->rq_arg.page_base = 0;
82 rqstp->rq_arg.pages = &rqstp->rq_pages[1];
83 sge_no = 1;
84 while (bc && sge_no < ctxt->count) {
85 page = ctxt->pages[sge_no];
86 put_page(rqstp->rq_pages[sge_no]);
87 rqstp->rq_pages[sge_no] = page;
88 bc -= min(bc, ctxt->sge[sge_no].length);
89 rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
90 sge_no++;
91 }
92 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
93
94 /* We should never run out of SGE because the limit is defined to
95 * support the max allowed RPC data length
96 */
97 BUG_ON(bc && (sge_no == ctxt->count));
98 BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
99 != byte_count);
100 BUG_ON(rqstp->rq_arg.len != byte_count);
101
102 /* If not all pages were used from the SGL, free the remaining ones */
103 bc = sge_no;
104 while (sge_no < ctxt->count) {
105 page = ctxt->pages[sge_no++];
106 put_page(page);
107 }
108 ctxt->count = bc;
109
110 /* Set up tail */
111 rqstp->rq_arg.tail[0].iov_base = NULL;
112 rqstp->rq_arg.tail[0].iov_len = 0;
113}
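A worked example may make the bookkeeping above concrete. Assume byte_count = 5000 and receive SGEs that each cover a full 4096-byte page (sizes are illustrative):

	/*
	 *   head[0].iov_len = min(5000, 4096) = 4096  (first swapped page)
	 *   bc              = 5000 - 4096     = 904   (spills to pagelist)
	 *   rq_arg.page_len = 904, held in the swapped-in pages[1]
	 *   rq_respages     = &rq_pages[2]            (first free page)
	 *
	 * Every SGL page beyond the second is put_page()'d and ctxt->count
	 * is trimmed to match.
	 */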
114
115struct chunk_sge {
116 int start; /* sge no for this chunk */
117 int count; /* sge count for this chunk */
118};
119
120/* Encode a read-chunk-list as an array of IB SGE
121 *
122 * Assumptions:
123 * - chunk[0]->position points to pages[0] at an offset of 0
124 * - pages[] is not physically or virtually contiguous and consists of
125 * PAGE_SIZE elements.
126 *
127 * Output:
128 * - sge array pointing into pages[] array.
129 * - chunk_sge array specifying sge index and count for each
130 * chunk in the read list
131 *
132 */
133static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
134 struct svc_rqst *rqstp,
135 struct svc_rdma_op_ctxt *head,
136 struct rpcrdma_msg *rmsgp,
137 struct ib_sge *sge,
138 struct chunk_sge *ch_sge_ary,
139 int ch_count,
140 int byte_count)
141{
142 int sge_no;
143 int sge_bytes;
144 int page_off;
145 int page_no;
146 int ch_bytes;
147 int ch_no;
148 struct rpcrdma_read_chunk *ch;
149
150 sge_no = 0;
151 page_no = 0;
152 page_off = 0;
153 ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
154 ch_no = 0;
155 ch_bytes = ch->rc_target.rs_length;
156 head->arg.head[0] = rqstp->rq_arg.head[0];
157 head->arg.tail[0] = rqstp->rq_arg.tail[0];
158 head->arg.pages = &head->pages[head->count];
159 head->sge[0].length = head->count; /* save count of hdr pages */
160 head->arg.page_base = 0;
161 head->arg.page_len = ch_bytes;
162 head->arg.len = rqstp->rq_arg.len + ch_bytes;
163 head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
164 head->count++;
165 ch_sge_ary[0].start = 0;
166 while (byte_count) {
167 sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
168 sge[sge_no].addr =
169 ib_dma_map_page(xprt->sc_cm_id->device,
170 rqstp->rq_arg.pages[page_no],
171 page_off, sge_bytes,
172 DMA_FROM_DEVICE);
173 sge[sge_no].length = sge_bytes;
174 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
175 /*
176 * Don't bump head->count here because the same page
177 * may be used by multiple SGE.
178 */
179 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
180 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
181
182 byte_count -= sge_bytes;
183 ch_bytes -= sge_bytes;
184 sge_no++;
185 /*
186 * If all bytes for this chunk have been mapped to an
187		 * SGE, move to the next chunk
188 */
189 if (ch_bytes == 0) {
190 ch_sge_ary[ch_no].count =
191 sge_no - ch_sge_ary[ch_no].start;
192 ch_no++;
193 ch++;
194 ch_sge_ary[ch_no].start = sge_no;
195 ch_bytes = ch->rc_target.rs_length;
196			/* If bytes remain, account for the next chunk */
197 if (byte_count) {
198 head->arg.page_len += ch_bytes;
199 head->arg.len += ch_bytes;
200 head->arg.buflen += ch_bytes;
201 }
202 }
203 /*
204 * If this SGE consumed all of the page, move to the
205 * next page
206 */
207 if ((sge_bytes + page_off) == PAGE_SIZE) {
208 page_no++;
209 page_off = 0;
210 /*
211 * If there are still bytes left to map, bump
212 * the page count
213 */
214 if (byte_count)
215 head->count++;
216 } else
217 page_off += sge_bytes;
218 }
219 BUG_ON(byte_count != 0);
220 return sge_no;
221}
222
223static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
224 struct ib_sge *sge,
225 u64 *sgl_offset,
226 int count)
227{
228 int i;
229
230 ctxt->count = count;
231 for (i = 0; i < count; i++) {
232 ctxt->sge[i].addr = sge[i].addr;
233 ctxt->sge[i].length = sge[i].length;
234 *sgl_offset = *sgl_offset + sge[i].length;
235 }
236}
237
238static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
239{
240#ifdef RDMA_TRANSPORT_IWARP
241 if ((RDMA_TRANSPORT_IWARP ==
242 rdma_node_get_transport(xprt->sc_cm_id->
243 device->node_type))
244 && sge_count > 1)
245 return 1;
246 else
247#endif
248 return min_t(int, sge_count, xprt->sc_max_sge);
249}
250
251/*
252 * Use RDMA_READ to read data from the advertised client buffer into the
253 * XDR stream starting at rq_arg.head[0].iov_base.
254 * Each chunk in the array
255 * contains the following fields:
256 * discrim - '1'; this is not used for data placement
257 * position - The xdr stream offset (the same for every chunk)
258 * handle - RMR for client memory region
259 * length - data transfer length
260 * offset - 64 bit tagged offset in remote memory region
261 *
262 * On our side, we need to read into a pagelist. The first page immediately
263 * follows the RPC header.
264 *
265 * This function returns 1 to indicate success. The data is not yet in
266 * the pagelist and therefore the RPC request must be deferred. The
267 * I/O completion will enqueue the transport again and
268 * svc_rdma_recvfrom will complete the request.
269 *
270 * NOTE: The ctxt must not be touched after the last WR has been posted
271 * because the I/O completion processing may occur on another
272 * processor and free or modify the context. Do not touch it!
273 */
274static int rdma_read_xdr(struct svcxprt_rdma *xprt,
275 struct rpcrdma_msg *rmsgp,
276 struct svc_rqst *rqstp,
277 struct svc_rdma_op_ctxt *hdr_ctxt)
278{
279 struct ib_send_wr read_wr;
280 int err = 0;
281 int ch_no;
282 struct ib_sge *sge;
283 int ch_count;
284 int byte_count;
285 int sge_count;
286 u64 sgl_offset;
287 struct rpcrdma_read_chunk *ch;
288 struct svc_rdma_op_ctxt *ctxt = NULL;
289 struct svc_rdma_op_ctxt *head;
290 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
291 struct svc_rdma_op_ctxt *tmp_ch_ctxt;
292 struct chunk_sge *ch_sge_ary;
293
294 /* If no read list is present, return 0 */
295 ch = svc_rdma_get_read_chunk(rmsgp);
296 if (!ch)
297 return 0;
298
299 /* Allocate temporary contexts to keep SGE */
300 BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
301 tmp_sge_ctxt = svc_rdma_get_context(xprt);
302 sge = tmp_sge_ctxt->sge;
303 tmp_ch_ctxt = svc_rdma_get_context(xprt);
304 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
305
306 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
307 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
308 sge, ch_sge_ary,
309 ch_count, byte_count);
310 head = svc_rdma_get_context(xprt);
311 sgl_offset = 0;
312 ch_no = 0;
313
314 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
315 ch->rc_discrim != 0; ch++, ch_no++) {
316next_sge:
317 if (!ctxt)
318 ctxt = head;
319 else {
320 ctxt->next = svc_rdma_get_context(xprt);
321 ctxt = ctxt->next;
322 }
323 ctxt->next = NULL;
324 ctxt->direction = DMA_FROM_DEVICE;
325 clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
326 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
327 if ((ch+1)->rc_discrim == 0) {
328 /*
329 * Checked in sq_cq_reap to see if we need to
330 * be enqueued
331 */
332 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
333 ctxt->next = hdr_ctxt;
334 hdr_ctxt->next = head;
335 }
336
337 /* Prepare READ WR */
338 memset(&read_wr, 0, sizeof read_wr);
339 ctxt->wr_op = IB_WR_RDMA_READ;
340 read_wr.wr_id = (unsigned long)ctxt;
341 read_wr.opcode = IB_WR_RDMA_READ;
342 read_wr.send_flags = IB_SEND_SIGNALED;
343 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
344 read_wr.wr.rdma.remote_addr =
345 get_unaligned(&(ch->rc_target.rs_offset)) +
346 sgl_offset;
347 read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
348 read_wr.num_sge =
349 rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
350 rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
351 &sgl_offset,
352 read_wr.num_sge);
353
354 /* Post the read */
355 err = svc_rdma_send(xprt, &read_wr);
356 if (err) {
357 printk(KERN_ERR "svcrdma: Error posting send = %d\n",
358 err);
359 /*
360			 * Break the circular list so the free loop knows
361			 * when to stop if the error happened to occur on
362			 * the last read
363 */
364 ctxt->next = NULL;
365 goto out;
366 }
367 atomic_inc(&rdma_stat_read);
368
369 if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
370 ch_sge_ary[ch_no].count -= read_wr.num_sge;
371 ch_sge_ary[ch_no].start += read_wr.num_sge;
372 goto next_sge;
373 }
374 sgl_offset = 0;
375 err = 0;
376 }
377
378 out:
379 svc_rdma_put_context(tmp_sge_ctxt, 0);
380 svc_rdma_put_context(tmp_ch_ctxt, 0);
381
382 /* Detach arg pages. svc_recv will replenish them */
383 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
384 rqstp->rq_pages[ch_no] = NULL;
385
386 /*
387	 * Detach res pages. svc_release must see an rq_resused count of
388 * zero or it will attempt to put them.
389 */
390 while (rqstp->rq_resused)
391 rqstp->rq_respages[--rqstp->rq_resused] = NULL;
392
393 if (err) {
394 printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
395 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
396 /* Free the linked list of read contexts */
397 while (head != NULL) {
398 ctxt = head->next;
399 svc_rdma_put_context(head, 1);
400 head = ctxt;
401 }
402 return 0;
403 }
404
405 return 1;
406}
407
408static int rdma_read_complete(struct svc_rqst *rqstp,
409 struct svc_rdma_op_ctxt *data)
410{
411 struct svc_rdma_op_ctxt *head = data->next;
412 int page_no;
413 int ret;
414
415 BUG_ON(!head);
416
417 /* Copy RPC pages */
418 for (page_no = 0; page_no < head->count; page_no++) {
419 put_page(rqstp->rq_pages[page_no]);
420 rqstp->rq_pages[page_no] = head->pages[page_no];
421 }
422 /* Point rq_arg.pages past header */
423 rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
424 rqstp->rq_arg.page_len = head->arg.page_len;
425 rqstp->rq_arg.page_base = head->arg.page_base;
426
427 /* rq_respages starts after the last arg page */
428 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
429 rqstp->rq_resused = 0;
430
431 /* Rebuild rq_arg head and tail. */
432 rqstp->rq_arg.head[0] = head->arg.head[0];
433 rqstp->rq_arg.tail[0] = head->arg.tail[0];
434 rqstp->rq_arg.len = head->arg.len;
435 rqstp->rq_arg.buflen = head->arg.buflen;
436
437 /* XXX: What should this be? */
438 rqstp->rq_prot = IPPROTO_MAX;
439
440 /*
441 * Free the contexts we used to build the RDMA_READ. We have
442 * to be careful here because the context list uses the same
443 * next pointer used to chain the contexts associated with the
444 * RDMA_READ
445 */
446 data->next = NULL; /* terminate circular list */
447 do {
448 data = head->next;
449 svc_rdma_put_context(head, 0);
450 head = data;
451 } while (head != NULL);
452
453 ret = rqstp->rq_arg.head[0].iov_len
454 + rqstp->rq_arg.page_len
455 + rqstp->rq_arg.tail[0].iov_len;
456 dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
457 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
458 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
459 rqstp->rq_arg.head[0].iov_len);
460
461 /* Indicate that we've consumed an RQ credit */
462 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
463 svc_xprt_received(rqstp->rq_xprt);
464 return ret;
465}
466
467/*
468 * Set up the rqstp thread context to point to the RQ buffer. If
469 * necessary, pull additional data from the client with an RDMA_READ
470 * request.
471 */
472int svc_rdma_recvfrom(struct svc_rqst *rqstp)
473{
474 struct svc_xprt *xprt = rqstp->rq_xprt;
475 struct svcxprt_rdma *rdma_xprt =
476 container_of(xprt, struct svcxprt_rdma, sc_xprt);
477 struct svc_rdma_op_ctxt *ctxt = NULL;
478 struct rpcrdma_msg *rmsgp;
479 int ret = 0;
480 int len;
481
482 dprintk("svcrdma: rqstp=%p\n", rqstp);
483
484 /*
485 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
486 * or not. It is used in the rdma xpo_release_rqst function to
487 * determine whether or not to return an RQ WQE to the RQ.
488 */
489 rqstp->rq_xprt_ctxt = NULL;
490
491 spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
492 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
493 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
494 struct svc_rdma_op_ctxt,
495 dto_q);
496 list_del_init(&ctxt->dto_q);
497 }
498 spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
499 if (ctxt)
500 return rdma_read_complete(rqstp, ctxt);
501
502 spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
503 if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
504 ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
505 struct svc_rdma_op_ctxt,
506 dto_q);
507 list_del_init(&ctxt->dto_q);
508 } else {
509 atomic_inc(&rdma_stat_rq_starve);
510 clear_bit(XPT_DATA, &xprt->xpt_flags);
511 ctxt = NULL;
512 }
513 spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
514 if (!ctxt) {
515 /* This is the EAGAIN path. The svc_recv routine will
516		 * return -EAGAIN, the nfsd thread will call into
517		 * svc_recv again, and we shouldn't be on the active
518 * transport list
519 */
520 if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
521 goto close_out;
522
523 BUG_ON(ret);
524 goto out;
525 }
526 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
527 ctxt, rdma_xprt, rqstp, ctxt->wc_status);
528 BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
529 atomic_inc(&rdma_stat_recv);
530
531 /* Build up the XDR from the receive buffers. */
532 rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
533
534 /* Decode the RDMA header. */
535 len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
536 rqstp->rq_xprt_hlen = len;
537
538 /* If the request is invalid, reply with an error */
539 if (len < 0) {
540 if (len == -ENOSYS)
541 (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
542 goto close_out;
543 }
544
545 /* Read read-list data. If we would need to wait, defer
546	 * it. Note that in this case, we don't return the RQ credit
547 * until after the read completes.
548 */
549 if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
550 svc_xprt_received(xprt);
551 return 0;
552 }
553
554 /* Indicate we've consumed an RQ credit */
555 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
556
557 ret = rqstp->rq_arg.head[0].iov_len
558 + rqstp->rq_arg.page_len
559 + rqstp->rq_arg.tail[0].iov_len;
560 svc_rdma_put_context(ctxt, 0);
561 out:
562 dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
563 "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
564 ret, rqstp->rq_arg.len,
565 rqstp->rq_arg.head[0].iov_base,
566 rqstp->rq_arg.head[0].iov_len);
567 rqstp->rq_prot = IPPROTO_MAX;
568 svc_xprt_copy_addrs(rqstp, xprt);
569 svc_xprt_received(xprt);
570 return ret;
571
572 close_out:
573 if (ctxt) {
574 svc_rdma_put_context(ctxt, 1);
575 /* Indicate we've consumed an RQ credit */
576 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
577 }
578 dprintk("svcrdma: transport %p is closing\n", xprt);
579 /*
580 * Set the close bit and enqueue it. svc_recv will see the
581 * close bit and call svc_xprt_delete
582 */
583 set_bit(XPT_CLOSE, &xprt->xpt_flags);
584 svc_xprt_received(xprt);
585 return 0;
586}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000000..3e321949e1dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,520 @@
1/*
2 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/debug.h>
43#include <linux/sunrpc/rpc_rdma.h>
44#include <linux/spinlock.h>
45#include <asm/unaligned.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52/* Encode an XDR as an array of IB SGE
53 *
54 * Assumptions:
55 * - head[0] is physically contiguous.
56 * - tail[0] is physically contiguous.
57 * - pages[] is not physically or virtually contiguous and consists of
58 * PAGE_SIZE elements.
59 *
60 * Output:
61 * SGE[0] reserved for RPCRDMA header
62 * SGE[1] data from xdr->head[]
63 * SGE[2..sge_count-2] data from xdr->pages[]
64 * SGE[sge_count-1] data from xdr->tail.
65 *
66 */
67static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
68 struct xdr_buf *xdr,
69 struct ib_sge *sge,
70 int *sge_count)
71{
72 /* Max we need is the length of the XDR / pagesize + one for
73 * head + one for tail + one for RPCRDMA header
74 */
75 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
76 int sge_no;
77 u32 byte_count = xdr->len;
78 u32 sge_bytes;
79 u32 page_bytes;
80 int page_off;
81 int page_no;
82
83 /* Skip the first sge, this is for the RPCRDMA header */
84 sge_no = 1;
85
86 /* Head SGE */
87 sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
88 xdr->head[0].iov_base,
89 xdr->head[0].iov_len,
90 DMA_TO_DEVICE);
91 sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
92 byte_count -= sge_bytes;
93 sge[sge_no].length = sge_bytes;
94 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
95 sge_no++;
96
97 /* pages SGE */
98 page_no = 0;
99 page_bytes = xdr->page_len;
100 page_off = xdr->page_base;
101 while (byte_count && page_bytes) {
102 sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
103 sge[sge_no].addr =
104 ib_dma_map_page(xprt->sc_cm_id->device,
105 xdr->pages[page_no], page_off,
106 sge_bytes, DMA_TO_DEVICE);
107 sge_bytes = min(sge_bytes, page_bytes);
108 byte_count -= sge_bytes;
109 page_bytes -= sge_bytes;
110 sge[sge_no].length = sge_bytes;
111 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
112
113 sge_no++;
114 page_no++;
115 page_off = 0; /* reset for next time through loop */
116 }
117
118 /* Tail SGE */
119 if (byte_count && xdr->tail[0].iov_len) {
120 sge[sge_no].addr =
121 ib_dma_map_single(xprt->sc_cm_id->device,
122 xdr->tail[0].iov_base,
123 xdr->tail[0].iov_len,
124 DMA_TO_DEVICE);
125 sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
126 byte_count -= sge_bytes;
127 sge[sge_no].length = sge_bytes;
128 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
129 sge_no++;
130 }
131
132 BUG_ON(sge_no > sge_max);
133 BUG_ON(byte_count != 0);
134
135 *sge_count = sge_no;
136 return sge;
137}
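/*
 * A worked example of the layout above, assuming PAGE_SIZE == 4096:
 * an xdr_buf with head[0].iov_len = 100, page_len = 6000 (page_base
 * = 0) and tail[0].iov_len = 50, i.e. xdr->len = 6150, maps to:
 *
 *	sge[1]	 100 bytes from head[0]
 *	sge[2]	4096 bytes from pages[0]
 *	sge[3]	1904 bytes from pages[1]
 *	sge[4]	  50 bytes from tail[0]
 *
 * so *sge_count = 5, which matches sge_max = (6150+4095)/4096 + 3.
 * sge[0] is left for the caller to fill in with the RPCRDMA header.
 */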
138
139
140/* Assumptions:
141 * - The specified write_len must fit within sc_max_sge * PAGE_SIZE
142 */
143static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
144 u32 rmr, u64 to,
145 u32 xdr_off, int write_len,
146 struct ib_sge *xdr_sge, int sge_count)
147{
148 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
149 struct ib_send_wr write_wr;
150 struct ib_sge *sge;
151 int xdr_sge_no;
152 int sge_no;
153 int sge_bytes;
154 int sge_off;
155 int bc;
156 struct svc_rdma_op_ctxt *ctxt;
157 int ret = 0;
158
159 BUG_ON(sge_count >= 32);
160 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
161 "write_len=%d, xdr_sge=%p, sge_count=%d\n",
162 rmr, to, xdr_off, write_len, xdr_sge, sge_count);
163
164 ctxt = svc_rdma_get_context(xprt);
165 ctxt->count = 0;
166 tmp_sge_ctxt = svc_rdma_get_context(xprt);
167 sge = tmp_sge_ctxt->sge;
168
169 /* Find the SGE associated with xdr_off */
170 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
171 xdr_sge_no++) {
172 if (xdr_sge[xdr_sge_no].length > bc)
173 break;
174 bc -= xdr_sge[xdr_sge_no].length;
175 }
176
177 sge_off = bc;
178 bc = write_len;
179 sge_no = 0;
180
181 /* Copy the remaining SGE */
182 while (bc != 0 && xdr_sge_no < sge_count) {
183 sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
184 sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
185 sge_bytes = min((size_t)bc,
186 (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
187 sge[sge_no].length = sge_bytes;
188
189 sge_off = 0;
190 sge_no++;
191 xdr_sge_no++;
192 bc -= sge_bytes;
193 }
194
195 BUG_ON(bc != 0);
196 BUG_ON(xdr_sge_no > sge_count);
197
198 /* Prepare WRITE WR */
199 memset(&write_wr, 0, sizeof write_wr);
200 ctxt->wr_op = IB_WR_RDMA_WRITE;
201 write_wr.wr_id = (unsigned long)ctxt;
202 write_wr.sg_list = &sge[0];
203 write_wr.num_sge = sge_no;
204 write_wr.opcode = IB_WR_RDMA_WRITE;
205 write_wr.send_flags = IB_SEND_SIGNALED;
206 write_wr.wr.rdma.rkey = rmr;
207 write_wr.wr.rdma.remote_addr = to;
208
209 /* Post It */
210 atomic_inc(&rdma_stat_write);
211 if (svc_rdma_send(xprt, &write_wr)) {
212 svc_rdma_put_context(ctxt, 1);
213 /* Fatal error, close transport */
214 ret = -EIO;
215 }
216 svc_rdma_put_context(tmp_sge_ctxt, 0);
217 return ret;
218}
219
220static int send_write_chunks(struct svcxprt_rdma *xprt,
221 struct rpcrdma_msg *rdma_argp,
222 struct rpcrdma_msg *rdma_resp,
223 struct svc_rqst *rqstp,
224 struct ib_sge *sge,
225 int sge_count)
226{
227 u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
228 int write_len;
229 int max_write;
230 u32 xdr_off;
231 int chunk_off;
232 int chunk_no;
233 struct rpcrdma_write_array *arg_ary;
234 struct rpcrdma_write_array *res_ary;
235 int ret;
236
237 arg_ary = svc_rdma_get_write_array(rdma_argp);
238 if (!arg_ary)
239 return 0;
240 res_ary = (struct rpcrdma_write_array *)
241 &rdma_resp->rm_body.rm_chunks[1];
242
243 max_write = xprt->sc_max_sge * PAGE_SIZE;
244
245 /* Write chunks start at the pagelist */
246 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
247 xfer_len && chunk_no < arg_ary->wc_nchunks;
248 chunk_no++) {
249 struct rpcrdma_segment *arg_ch;
250 u64 rs_offset;
251
252 arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
253 write_len = min(xfer_len, arg_ch->rs_length);
254
255 /* Prepare the response chunk given the length actually
256 * written */
257 rs_offset = get_unaligned(&(arg_ch->rs_offset));
258 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
259 arg_ch->rs_handle,
260 rs_offset,
261 write_len);
262 chunk_off = 0;
263 while (write_len) {
264 int this_write;
265 this_write = min(write_len, max_write);
266 ret = send_write(xprt, rqstp,
267 arg_ch->rs_handle,
268 rs_offset + chunk_off,
269 xdr_off,
270 this_write,
271 sge,
272 sge_count);
273 if (ret) {
274 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
275 ret);
276 return -EIO;
277 }
278 chunk_off += this_write;
279 xdr_off += this_write;
280 xfer_len -= this_write;
281 write_len -= this_write;
282 }
283 }
284 /* Update the req with the number of chunks actually used */
285 svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
286
287 return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
288}
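/*
 * A chunking sketch for the loop above, with assumed example values:
 * if sc_max_sge = 4 and PAGE_SIZE = 4096, max_write is 16384, so a
 * single 40000-byte write chunk goes out as three RDMA_WRITEs of
 * 16384, 16384 and 7232 bytes, each advancing xdr_off into the
 * response and chunk_off into the client-advertised buffer.
 */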
289
290static int send_reply_chunks(struct svcxprt_rdma *xprt,
291 struct rpcrdma_msg *rdma_argp,
292 struct rpcrdma_msg *rdma_resp,
293 struct svc_rqst *rqstp,
294 struct ib_sge *sge,
295 int sge_count)
296{
297 u32 xfer_len = rqstp->rq_res.len;
298 int write_len;
299 int max_write;
300 u32 xdr_off;
301 int chunk_no;
302 int chunk_off;
303 struct rpcrdma_segment *ch;
304 struct rpcrdma_write_array *arg_ary;
305 struct rpcrdma_write_array *res_ary;
306 int ret;
307
308 arg_ary = svc_rdma_get_reply_array(rdma_argp);
309 if (!arg_ary)
310 return 0;
311	/* XXX: need to fix when reply lists occur with read-list and/or
312 * write-list */
313 res_ary = (struct rpcrdma_write_array *)
314 &rdma_resp->rm_body.rm_chunks[2];
315
316 max_write = xprt->sc_max_sge * PAGE_SIZE;
317
318 /* xdr offset starts at RPC message */
319 for (xdr_off = 0, chunk_no = 0;
320 xfer_len && chunk_no < arg_ary->wc_nchunks;
321 chunk_no++) {
322 u64 rs_offset;
323 ch = &arg_ary->wc_array[chunk_no].wc_target;
324 write_len = min(xfer_len, ch->rs_length);
325
327 /* Prepare the reply chunk given the length actually
328 * written */
329 rs_offset = get_unaligned(&(ch->rs_offset));
330 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
331 ch->rs_handle, rs_offset,
332 write_len);
333 chunk_off = 0;
334 while (write_len) {
335 int this_write;
336
337 this_write = min(write_len, max_write);
338 ret = send_write(xprt, rqstp,
339 ch->rs_handle,
340 rs_offset + chunk_off,
341 xdr_off,
342 this_write,
343 sge,
344 sge_count);
345 if (ret) {
346 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
347 ret);
348 return -EIO;
349 }
350 chunk_off += this_write;
351 xdr_off += this_write;
352 xfer_len -= this_write;
353 write_len -= this_write;
354 }
355 }
356 /* Update the req with the number of chunks actually used */
357 svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
358
359 return rqstp->rq_res.len;
360}
361
362/* This function prepares the portion of the RPCRDMA message to be
363 * sent in the RDMA_SEND. This function is called after data sent via
364 * RDMA has already been transmitted. There are three cases:
365 * - The RPCRDMA header, RPC header, and payload are all sent in a
366 * single RDMA_SEND. This is the "inline" case.
367 * - The RPCRDMA header and some portion of the RPC header and data
368 * are sent via this RDMA_SEND and another portion of the data is
369 * sent via RDMA.
370 * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
371 * header and data are all transmitted via RDMA.
372 * In all three cases, this function prepares the RPCRDMA header in
373 * sge[0], the 'type' parameter indicates the type to place in the
374 * RPCRDMA header, and the 'byte_count' field indicates how much of
375 * the XDR to include in this RDMA_SEND.
376 */
377static int send_reply(struct svcxprt_rdma *rdma,
378 struct svc_rqst *rqstp,
379 struct page *page,
380 struct rpcrdma_msg *rdma_resp,
381 struct svc_rdma_op_ctxt *ctxt,
382 int sge_count,
383 int byte_count)
384{
385 struct ib_send_wr send_wr;
386 int sge_no;
387 int sge_bytes;
388 int page_no;
389 int ret;
390
391 /* Prepare the context */
392 ctxt->pages[0] = page;
393 ctxt->count = 1;
394
395 /* Prepare the SGE for the RPCRDMA Header */
396 ctxt->sge[0].addr =
397 ib_dma_map_page(rdma->sc_cm_id->device,
398 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
399 ctxt->direction = DMA_TO_DEVICE;
400 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
401 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
402
403 /* Determine how many of our SGE are to be transmitted */
404 for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
405 sge_bytes = min((size_t)ctxt->sge[sge_no].length,
406 (size_t)byte_count);
407 byte_count -= sge_bytes;
408 }
409 BUG_ON(byte_count != 0);
410
411 /* Save all respages in the ctxt and remove them from the
412 * respages array. They are our pages until the I/O
413 * completes.
414 */
415 for (page_no = 0; page_no < rqstp->rq_resused; page_no++) {
416 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
417 ctxt->count++;
418 rqstp->rq_respages[page_no] = NULL;
419 }
420
421 BUG_ON(sge_no > rdma->sc_max_sge);
422 memset(&send_wr, 0, sizeof send_wr);
423 ctxt->wr_op = IB_WR_SEND;
424 send_wr.wr_id = (unsigned long)ctxt;
425 send_wr.sg_list = ctxt->sge;
426 send_wr.num_sge = sge_no;
427 send_wr.opcode = IB_WR_SEND;
428 send_wr.send_flags = IB_SEND_SIGNALED;
429
430 ret = svc_rdma_send(rdma, &send_wr);
431 if (ret)
432 svc_rdma_put_context(ctxt, 1);
433
434 return ret;
435}
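/*
 * Note on the trimming loop above: only byte_count bytes of the SGE
 * list built by xdr_to_sge() are transmitted inline. As an
 * illustration, if the write list already carried the entire
 * pagelist and tail, byte_count covers just the RPC header in
 * head[0], and num_sge ends up as 2: sge[0] for the RPCRDMA header
 * plus sge[1] for the head.
 */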
436
437void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
438{
439}
440
441/*
442 * Return the start of an xdr buffer.
443 */
444static void *xdr_start(struct xdr_buf *xdr)
445{
446 return xdr->head[0].iov_base -
447 (xdr->len -
448 xdr->page_len -
449 xdr->tail[0].iov_len -
450 xdr->head[0].iov_len);
451}
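/*
 * Sketch of the arithmetic above, assuming the receive path advanced
 * head[0].iov_base past the RPCRDMA header and shrank
 * head[0].iov_len by the same amount while leaving xdr->len counting
 * those bytes: (len - page_len - tail_len - head_len) is then exactly
 * the number of bytes consumed, so subtracting it from the current
 * head base recovers the start of the RPCRDMA header.
 */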
452
453int svc_rdma_sendto(struct svc_rqst *rqstp)
454{
455 struct svc_xprt *xprt = rqstp->rq_xprt;
456 struct svcxprt_rdma *rdma =
457 container_of(xprt, struct svcxprt_rdma, sc_xprt);
458 struct rpcrdma_msg *rdma_argp;
459 struct rpcrdma_msg *rdma_resp;
460 struct rpcrdma_write_array *reply_ary;
461 enum rpcrdma_proc reply_type;
462 int ret;
463 int inline_bytes;
464 struct ib_sge *sge;
465 int sge_count = 0;
466 struct page *res_page;
467 struct svc_rdma_op_ctxt *ctxt;
468
469 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
470
471 /* Get the RDMA request header. */
472 rdma_argp = xdr_start(&rqstp->rq_arg);
473
474 /* Build an SGE for the XDR */
475 ctxt = svc_rdma_get_context(rdma);
476 ctxt->direction = DMA_TO_DEVICE;
477 sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
478
479 inline_bytes = rqstp->rq_res.len;
480
481 /* Create the RDMA response header */
482 res_page = svc_rdma_get_page();
483 rdma_resp = page_address(res_page);
484 reply_ary = svc_rdma_get_reply_array(rdma_argp);
485 if (reply_ary)
486 reply_type = RDMA_NOMSG;
487 else
488 reply_type = RDMA_MSG;
489 svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
490 rdma_resp, reply_type);
491
492 /* Send any write-chunk data and build resp write-list */
493 ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
494 rqstp, sge, sge_count);
495 if (ret < 0) {
496 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
497 ret);
498 goto error;
499 }
500 inline_bytes -= ret;
501
502 /* Send any reply-list data and update resp reply-list */
503 ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
504 rqstp, sge, sge_count);
505 if (ret < 0) {
506 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
507 ret);
508 goto error;
509 }
510 inline_bytes -= ret;
511
512 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
513 inline_bytes);
514 dprintk("svcrdma: send_reply returns %d\n", ret);
515 return ret;
516 error:
517 svc_rdma_put_context(ctxt, 0);
518 put_page(res_page);
519 return ret;
520}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 000000000000..f09444c451bc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1080 @@
1/*
2 * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * Author: Tom Tucker <tom@opengridcomputing.com>
40 */
41
42#include <linux/sunrpc/svc_xprt.h>
43#include <linux/sunrpc/debug.h>
44#include <linux/sunrpc/rpc_rdma.h>
45#include <linux/spinlock.h>
46#include <rdma/ib_verbs.h>
47#include <rdma/rdma_cm.h>
48#include <linux/sunrpc/svc_rdma.h>
49
50#define RPCDBG_FACILITY RPCDBG_SVCXPRT
51
52static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
53 struct sockaddr *sa, int salen,
54 int flags);
55static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
56static void svc_rdma_release_rqst(struct svc_rqst *);
57static void rdma_destroy_xprt(struct svcxprt_rdma *xprt);
58static void dto_tasklet_func(unsigned long data);
59static void svc_rdma_detach(struct svc_xprt *xprt);
60static void svc_rdma_free(struct svc_xprt *xprt);
61static int svc_rdma_has_wspace(struct svc_xprt *xprt);
62static void rq_cq_reap(struct svcxprt_rdma *xprt);
63static void sq_cq_reap(struct svcxprt_rdma *xprt);
64
65DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
66static DEFINE_SPINLOCK(dto_lock);
67static LIST_HEAD(dto_xprt_q);
68
69static struct svc_xprt_ops svc_rdma_ops = {
70 .xpo_create = svc_rdma_create,
71 .xpo_recvfrom = svc_rdma_recvfrom,
72 .xpo_sendto = svc_rdma_sendto,
73 .xpo_release_rqst = svc_rdma_release_rqst,
74 .xpo_detach = svc_rdma_detach,
75 .xpo_free = svc_rdma_free,
76 .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
77 .xpo_has_wspace = svc_rdma_has_wspace,
78 .xpo_accept = svc_rdma_accept,
79};
80
81struct svc_xprt_class svc_rdma_class = {
82 .xcl_name = "rdma",
83 .xcl_owner = THIS_MODULE,
84 .xcl_ops = &svc_rdma_ops,
85 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
86};
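/*
 * A minimal registration sketch (the actual call lives with the
 * module setup code elsewhere in this patch):
 *
 *	if (svc_reg_xprt_class(&svc_rdma_class))
 *		dprintk("svcrdma: failed to register rdma class\n");
 *
 * after which a server can create an "rdma" listener with
 * svc_create_xprt(serv, "rdma", port, flags).
 */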
87
88static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
89{
90 int target;
91 int at_least_one = 0;
92 struct svc_rdma_op_ctxt *ctxt;
93
94 target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
95 xprt->sc_ctxt_max);
96
97 spin_lock_bh(&xprt->sc_ctxt_lock);
98 while (xprt->sc_ctxt_cnt < target) {
99 xprt->sc_ctxt_cnt++;
100 spin_unlock_bh(&xprt->sc_ctxt_lock);
101
102 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
103
104 spin_lock_bh(&xprt->sc_ctxt_lock);
105 if (ctxt) {
106 at_least_one = 1;
107 ctxt->next = xprt->sc_ctxt_head;
108 xprt->sc_ctxt_head = ctxt;
109 } else {
110 /* kmalloc failed...give up for now */
111 xprt->sc_ctxt_cnt--;
112 break;
113 }
114 }
115 spin_unlock_bh(&xprt->sc_ctxt_lock);
116 dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
117 xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
118 return at_least_one;
119}
120
121struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
122{
123 struct svc_rdma_op_ctxt *ctxt;
124
125 while (1) {
126 spin_lock_bh(&xprt->sc_ctxt_lock);
127 if (unlikely(xprt->sc_ctxt_head == NULL)) {
128 /* Try to bump my cache. */
129 spin_unlock_bh(&xprt->sc_ctxt_lock);
130
131 if (rdma_bump_context_cache(xprt))
132 continue;
133
134 printk(KERN_INFO "svcrdma: sleeping waiting for "
135 "context memory on xprt=%p\n",
136 xprt);
137 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
138 continue;
139 }
140 ctxt = xprt->sc_ctxt_head;
141 xprt->sc_ctxt_head = ctxt->next;
142 spin_unlock_bh(&xprt->sc_ctxt_lock);
143 ctxt->xprt = xprt;
144 INIT_LIST_HEAD(&ctxt->dto_q);
145 ctxt->count = 0;
146 break;
147 }
148 return ctxt;
149}
150
151void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
152{
153 struct svcxprt_rdma *xprt;
154 int i;
155
156 BUG_ON(!ctxt);
157 xprt = ctxt->xprt;
158 if (free_pages)
159 for (i = 0; i < ctxt->count; i++)
160 put_page(ctxt->pages[i]);
161
162 for (i = 0; i < ctxt->count; i++)
163 dma_unmap_single(xprt->sc_cm_id->device->dma_device,
164 ctxt->sge[i].addr,
165 ctxt->sge[i].length,
166 ctxt->direction);
167 spin_lock_bh(&xprt->sc_ctxt_lock);
168 ctxt->next = xprt->sc_ctxt_head;
169 xprt->sc_ctxt_head = ctxt;
170 spin_unlock_bh(&xprt->sc_ctxt_lock);
171}
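/*
 * Typical get/put pairing, mirroring the senders and completion
 * handlers in this file; free_pages is 1 only when ctxt->pages[]
 * owns its pages (receive buffers, reply headers):
 *
 *	ctxt = svc_rdma_get_context(xprt);
 *	... post a WR with wr_id = (unsigned long)ctxt ...
 *	svc_rdma_put_context(ctxt, 1);	(from the completion path)
 */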
172
173/* ib_cq event handler */
174static void cq_event_handler(struct ib_event *event, void *context)
175{
176 struct svc_xprt *xprt = context;
177 dprintk("svcrdma: received CQ event id=%d, context=%p\n",
178 event->event, context);
179 set_bit(XPT_CLOSE, &xprt->xpt_flags);
180}
181
182/* QP event handler */
183static void qp_event_handler(struct ib_event *event, void *context)
184{
185 struct svc_xprt *xprt = context;
186
187 switch (event->event) {
188 /* These are considered benign events */
189 case IB_EVENT_PATH_MIG:
190 case IB_EVENT_COMM_EST:
191 case IB_EVENT_SQ_DRAINED:
192 case IB_EVENT_QP_LAST_WQE_REACHED:
193 dprintk("svcrdma: QP event %d received for QP=%p\n",
194 event->event, event->element.qp);
195 break;
196 /* These are considered fatal events */
197 case IB_EVENT_PATH_MIG_ERR:
198 case IB_EVENT_QP_FATAL:
199 case IB_EVENT_QP_REQ_ERR:
200 case IB_EVENT_QP_ACCESS_ERR:
201 case IB_EVENT_DEVICE_FATAL:
202 default:
203 dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
204 "closing transport\n",
205 event->event, event->element.qp);
206 set_bit(XPT_CLOSE, &xprt->xpt_flags);
207 break;
208 }
209}
210
211/*
212 * Data Transfer Operation Tasklet
213 *
214 * Walks a list of transports with I/O pending, removing entries as
215 * they are added to the server's I/O pending list. Two bits indicate
216 * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
217 * spinlock that serializes access to the transport list with the RQ
218 * and SQ interrupt handlers.
219 */
220static void dto_tasklet_func(unsigned long data)
221{
222 struct svcxprt_rdma *xprt;
223 unsigned long flags;
224
225 spin_lock_irqsave(&dto_lock, flags);
226 while (!list_empty(&dto_xprt_q)) {
227 xprt = list_entry(dto_xprt_q.next,
228 struct svcxprt_rdma, sc_dto_q);
229 list_del_init(&xprt->sc_dto_q);
230 spin_unlock_irqrestore(&dto_lock, flags);
231
232 if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
233 ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
234 rq_cq_reap(xprt);
235 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
236 /*
237 * If data arrived before established event,
238 * don't enqueue. This defers RPC I/O until the
239 * RDMA connection is complete.
240 */
241 if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
242 svc_xprt_enqueue(&xprt->sc_xprt);
243 }
244
245 if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
246 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
247 sq_cq_reap(xprt);
248 }
249
250 spin_lock_irqsave(&dto_lock, flags);
251 }
252 spin_unlock_irqrestore(&dto_lock, flags);
253}
254
255/*
256 * Receive Queue Completion Handler
257 *
258 * Since an RQ completion handler is called on interrupt context, we
259 * need to defer the handling of the I/O to a tasklet
260 */
261static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
262{
263 struct svcxprt_rdma *xprt = cq_context;
264 unsigned long flags;
265
266 /*
267 * Set the bit regardless of whether or not it's on the list
268 * because it may be on the list already due to an SQ
269 * completion.
270 */
271 set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
272
273 /*
274 * If this transport is not already on the DTO transport queue,
275 * add it
276 */
277 spin_lock_irqsave(&dto_lock, flags);
278 if (list_empty(&xprt->sc_dto_q))
279 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
280 spin_unlock_irqrestore(&dto_lock, flags);
281
282 /* Tasklet does all the work to avoid irqsave locks. */
283 tasklet_schedule(&dto_tasklet);
284}
285
286/*
287 * rq_cq_reap - Process the RQ CQ.
288 *
289 * Take all completing WC off the CQE and enqueue the associated DTO
290 * context on the dto_q for the transport.
291 */
292static void rq_cq_reap(struct svcxprt_rdma *xprt)
293{
294 int ret;
295 struct ib_wc wc;
296 struct svc_rdma_op_ctxt *ctxt = NULL;
297
298 atomic_inc(&rdma_stat_rq_poll);
299
300 spin_lock_bh(&xprt->sc_rq_dto_lock);
301 while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
302 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
303 ctxt->wc_status = wc.status;
304 ctxt->byte_len = wc.byte_len;
305 if (wc.status != IB_WC_SUCCESS) {
306 /* Close the transport */
307 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
308 svc_rdma_put_context(ctxt, 1);
309 continue;
310 }
311 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
312 }
313 spin_unlock_bh(&xprt->sc_rq_dto_lock);
314
315 if (ctxt)
316 atomic_inc(&rdma_stat_rq_prod);
317}
318
319/*
320 * Send Queue Completion Handler - potentially called on interrupt context.
321 */
322static void sq_cq_reap(struct svcxprt_rdma *xprt)
323{
324 struct svc_rdma_op_ctxt *ctxt = NULL;
325 struct ib_wc wc;
326 struct ib_cq *cq = xprt->sc_sq_cq;
327 int ret;
328
329 atomic_inc(&rdma_stat_sq_poll);
330 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
331 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
332 xprt = ctxt->xprt;
333
334 if (wc.status != IB_WC_SUCCESS)
335 /* Close the transport */
336 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
337
338 /* Decrement used SQ WR count */
339 atomic_dec(&xprt->sc_sq_count);
340 wake_up(&xprt->sc_send_wait);
341
342 switch (ctxt->wr_op) {
343 case IB_WR_SEND:
344 case IB_WR_RDMA_WRITE:
345 svc_rdma_put_context(ctxt, 1);
346 break;
347
348 case IB_WR_RDMA_READ:
349 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
350 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
351 set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
352 spin_lock_bh(&xprt->sc_read_complete_lock);
353 list_add_tail(&ctxt->dto_q,
354 &xprt->sc_read_complete_q);
355 spin_unlock_bh(&xprt->sc_read_complete_lock);
356 svc_xprt_enqueue(&xprt->sc_xprt);
357 }
358 break;
359
360 default:
361 printk(KERN_ERR "svcrdma: unexpected completion type, "
362 "opcode=%d, status=%d\n",
363 wc.opcode, wc.status);
364 break;
365 }
366 }
367
368 if (ctxt)
369 atomic_inc(&rdma_stat_sq_prod);
370}
371
372static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
373{
374 struct svcxprt_rdma *xprt = cq_context;
375 unsigned long flags;
376
377 /*
378 * Set the bit regardless of whether or not it's on the list
379 * because it may be on the list already due to an RQ
380 * completion.
381 */
382 set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
383
384 /*
385 * If this transport is not already on the DTO transport queue,
386 * add it
387 */
388 spin_lock_irqsave(&dto_lock, flags);
389 if (list_empty(&xprt->sc_dto_q))
390 list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
391 spin_unlock_irqrestore(&dto_lock, flags);
392
393 /* Tasklet does all the work to avoid irqsave locks. */
394 tasklet_schedule(&dto_tasklet);
395}
396
397static void create_context_cache(struct svcxprt_rdma *xprt,
398 int ctxt_count, int ctxt_bump, int ctxt_max)
399{
400 struct svc_rdma_op_ctxt *ctxt;
401 int i;
402
403 xprt->sc_ctxt_max = ctxt_max;
404 xprt->sc_ctxt_bump = ctxt_bump;
405 xprt->sc_ctxt_cnt = 0;
406 xprt->sc_ctxt_head = NULL;
407 for (i = 0; i < ctxt_count; i++) {
408 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
409 if (ctxt) {
410 ctxt->next = xprt->sc_ctxt_head;
411 xprt->sc_ctxt_head = ctxt;
412 xprt->sc_ctxt_cnt++;
413 }
414 }
415}
416
417static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
418{
419 struct svc_rdma_op_ctxt *next;
420 if (!ctxt)
421 return;
422
423 do {
424 next = ctxt->next;
425 kfree(ctxt);
426 ctxt = next;
427 } while (next);
428}
429
430static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
431 int listener)
432{
433 struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
434
435 if (!cma_xprt)
436 return NULL;
437 svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv);
438 INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
439 INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
440 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
441 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
442 init_waitqueue_head(&cma_xprt->sc_send_wait);
443
444 spin_lock_init(&cma_xprt->sc_lock);
445 spin_lock_init(&cma_xprt->sc_read_complete_lock);
446 spin_lock_init(&cma_xprt->sc_ctxt_lock);
447 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
448
449 cma_xprt->sc_ord = svcrdma_ord;
450
451 cma_xprt->sc_max_req_size = svcrdma_max_req_size;
452 cma_xprt->sc_max_requests = svcrdma_max_requests;
453 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
454 atomic_set(&cma_xprt->sc_sq_count, 0);
455
456 if (!listener) {
457 int reqs = cma_xprt->sc_max_requests;
458 create_context_cache(cma_xprt,
459 reqs << 1, /* starting size */
460 reqs, /* bump amount */
461 reqs +
462 cma_xprt->sc_sq_depth +
463 RPCRDMA_MAX_THREADS + 1); /* max */
464 if (!cma_xprt->sc_ctxt_head) {
465 kfree(cma_xprt);
466 return NULL;
467 }
468 clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
469 } else
470 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
471
472 return cma_xprt;
473}
474
475struct page *svc_rdma_get_page(void)
476{
477 struct page *page;
478
479 while ((page = alloc_page(GFP_KERNEL)) == NULL) {
480 /* If we can't get memory, wait a bit and try again */
481		printk(KERN_INFO "svcrdma: out of memory...retrying in 1 "
482		       "second.\n");
483 schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
484 }
485 return page;
486}
487
488int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
489{
490 struct ib_recv_wr recv_wr, *bad_recv_wr;
491 struct svc_rdma_op_ctxt *ctxt;
492 struct page *page;
493 unsigned long pa;
494 int sge_no;
495 int buflen;
496 int ret;
497
498 ctxt = svc_rdma_get_context(xprt);
499 buflen = 0;
500 ctxt->direction = DMA_FROM_DEVICE;
501 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
502 BUG_ON(sge_no >= xprt->sc_max_sge);
503 page = svc_rdma_get_page();
504 ctxt->pages[sge_no] = page;
505 pa = ib_dma_map_page(xprt->sc_cm_id->device,
506 page, 0, PAGE_SIZE,
507 DMA_FROM_DEVICE);
508 ctxt->sge[sge_no].addr = pa;
509 ctxt->sge[sge_no].length = PAGE_SIZE;
510 ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
511 buflen += PAGE_SIZE;
512 }
513 ctxt->count = sge_no;
514 recv_wr.next = NULL;
515 recv_wr.sg_list = &ctxt->sge[0];
516 recv_wr.num_sge = ctxt->count;
517 recv_wr.wr_id = (u64)(unsigned long)ctxt;
518
519 ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
520 return ret;
521}
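/*
 * Sizing note for the loop above: each posted receive covers
 * sc_max_req_size bytes in PAGE_SIZE slices. With assumed example
 * values of a 16KB maximum request and 4KB pages, every RQ WQE
 * therefore pins four pages and consumes four SGEs.
 */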
522
523/*
524 * This function handles the CONNECT_REQUEST event on a listening
525 * endpoint. It is passed the cma_id for the _new_ connection. The context in
526 * this cma_id is inherited from the listening cma_id and is the svc_xprt
527 * structure for the listening endpoint.
528 *
529 * This function creates a new xprt for the new connection and enqueues it on
530 * the accept queue for the listening xprt. When the listen thread is
531 * kicked, it will call the accept method on the listening xprt, which
532 * will accept the new connection.
533 */
534static void handle_connect_req(struct rdma_cm_id *new_cma_id)
535{
536 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
537 struct svcxprt_rdma *newxprt;
538
539 /* Create a new transport */
540 newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
541 if (!newxprt) {
542 dprintk("svcrdma: failed to create new transport\n");
543 return;
544 }
545 newxprt->sc_cm_id = new_cma_id;
546 new_cma_id->context = newxprt;
547 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
548 newxprt, newxprt->sc_cm_id, listen_xprt);
549
550 /*
551 * Enqueue the new transport on the accept queue of the listening
552 * transport
553 */
554 spin_lock_bh(&listen_xprt->sc_lock);
555 list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
556 spin_unlock_bh(&listen_xprt->sc_lock);
557
558 /*
559 * Can't use svc_xprt_received here because we are not on a
560 * rqstp thread
561 */
562 set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
563 svc_xprt_enqueue(&listen_xprt->sc_xprt);
564}
565
566/*
567 * Handles events generated on the listening endpoint. These events will
568 * either be incoming connect requests or adapter removal events.
569 */
570static int rdma_listen_handler(struct rdma_cm_id *cma_id,
571 struct rdma_cm_event *event)
572{
573 struct svcxprt_rdma *xprt = cma_id->context;
574 int ret = 0;
575
576 switch (event->event) {
577 case RDMA_CM_EVENT_CONNECT_REQUEST:
578 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
579 "event=%d\n", cma_id, cma_id->context, event->event);
580 handle_connect_req(cma_id);
581 break;
582
583 case RDMA_CM_EVENT_ESTABLISHED:
584 /* Accept complete */
585 dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
586 "cm_id=%p\n", xprt, cma_id);
587 break;
588
589 case RDMA_CM_EVENT_DEVICE_REMOVAL:
590 dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
591 xprt, cma_id);
592 if (xprt)
593 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
594 break;
595
596 default:
597 dprintk("svcrdma: Unexpected event on listening endpoint %p, "
598 "event=%d\n", cma_id, event->event);
599 break;
600 }
601
602 return ret;
603}
604
605static int rdma_cma_handler(struct rdma_cm_id *cma_id,
606 struct rdma_cm_event *event)
607{
608 struct svc_xprt *xprt = cma_id->context;
609 struct svcxprt_rdma *rdma =
610 container_of(xprt, struct svcxprt_rdma, sc_xprt);
611 switch (event->event) {
612 case RDMA_CM_EVENT_ESTABLISHED:
613 /* Accept complete */
614 dprintk("svcrdma: Connection completed on DTO xprt=%p, "
615 "cm_id=%p\n", xprt, cma_id);
616 clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
617 svc_xprt_enqueue(xprt);
618 break;
619 case RDMA_CM_EVENT_DISCONNECTED:
620 dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
621 xprt, cma_id);
622 if (xprt) {
623 set_bit(XPT_CLOSE, &xprt->xpt_flags);
624 svc_xprt_enqueue(xprt);
625 }
626 break;
627 case RDMA_CM_EVENT_DEVICE_REMOVAL:
628 dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
629 "event=%d\n", cma_id, xprt, event->event);
630 if (xprt) {
631 set_bit(XPT_CLOSE, &xprt->xpt_flags);
632 svc_xprt_enqueue(xprt);
633 }
634 break;
635 default:
636 dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
637 "event=%d\n", cma_id, event->event);
638 break;
639 }
640 return 0;
641}
642
643/*
644 * Create a listening RDMA service endpoint.
645 */
646static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
647 struct sockaddr *sa, int salen,
648 int flags)
649{
650 struct rdma_cm_id *listen_id;
651 struct svcxprt_rdma *cma_xprt;
652 struct svc_xprt *xprt;
653 int ret;
654
655 dprintk("svcrdma: Creating RDMA socket\n");
656
657 cma_xprt = rdma_create_xprt(serv, 1);
658 if (!cma_xprt)
659		return ERR_PTR(-ENOMEM);
660 xprt = &cma_xprt->sc_xprt;
661
662 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
663 if (IS_ERR(listen_id)) {
664 rdma_destroy_xprt(cma_xprt);
665 dprintk("svcrdma: rdma_create_id failed = %ld\n",
666 PTR_ERR(listen_id));
667 return (void *)listen_id;
668 }
669 ret = rdma_bind_addr(listen_id, sa);
670 if (ret) {
671 rdma_destroy_xprt(cma_xprt);
672 rdma_destroy_id(listen_id);
673 dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
674 return ERR_PTR(ret);
675 }
676 cma_xprt->sc_cm_id = listen_id;
677
678 ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
679 if (ret) {
680 rdma_destroy_id(listen_id);
681 rdma_destroy_xprt(cma_xprt);
682		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
		return ERR_PTR(ret);
683	}
684
685 /*
686 * We need to use the address from the cm_id in case the
687 * caller specified 0 for the port number.
688 */
689 sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
690 svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
691
692 return &cma_xprt->sc_xprt;
693}
694
695/*
696 * This is the xpo_accept function for listening endpoints. Its
697 * purpose is to accept incoming connections. The CMA callback handler
698 * has already created a new transport and attached it to the new CMA
699 * ID.
700 *
701 * There is a queue of pending connections hung on the listening
702 * transport. This queue contains the new svc_xprt structure. This
703 * function takes svc_xprt structures off the accept_q and completes
704 * the connection.
705 */
706static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
707{
708 struct svcxprt_rdma *listen_rdma;
709 struct svcxprt_rdma *newxprt = NULL;
710 struct rdma_conn_param conn_param;
711 struct ib_qp_init_attr qp_attr;
712 struct ib_device_attr devattr;
713 struct sockaddr *sa;
714 int ret;
715 int i;
716
717 listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
718 clear_bit(XPT_CONN, &xprt->xpt_flags);
719 /* Get the next entry off the accept list */
720 spin_lock_bh(&listen_rdma->sc_lock);
721 if (!list_empty(&listen_rdma->sc_accept_q)) {
722 newxprt = list_entry(listen_rdma->sc_accept_q.next,
723 struct svcxprt_rdma, sc_accept_q);
724 list_del_init(&newxprt->sc_accept_q);
725 }
726 if (!list_empty(&listen_rdma->sc_accept_q))
727 set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
728 spin_unlock_bh(&listen_rdma->sc_lock);
729 if (!newxprt)
730 return NULL;
731
732 dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
733 newxprt, newxprt->sc_cm_id);
734
735 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
736 if (ret) {
737 dprintk("svcrdma: could not query device attributes on "
738 "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
739 goto errout;
740 }
741
742 /* Qualify the transport resource defaults with the
743 * capabilities of this particular device */
744 newxprt->sc_max_sge = min((size_t)devattr.max_sge,
745 (size_t)RPCSVC_MAXPAGES);
746 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
747 (size_t)svcrdma_max_requests);
748 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
749
750 newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
751 (size_t)svcrdma_ord);
752
753 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
754 if (IS_ERR(newxprt->sc_pd)) {
755 dprintk("svcrdma: error creating PD for connect request\n");
756 goto errout;
757 }
758 newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
759 sq_comp_handler,
760 cq_event_handler,
761 newxprt,
762 newxprt->sc_sq_depth,
763 0);
764 if (IS_ERR(newxprt->sc_sq_cq)) {
765 dprintk("svcrdma: error creating SQ CQ for connect request\n");
766 goto errout;
767 }
768 newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
769 rq_comp_handler,
770 cq_event_handler,
771 newxprt,
772 newxprt->sc_max_requests,
773 0);
774 if (IS_ERR(newxprt->sc_rq_cq)) {
775 dprintk("svcrdma: error creating RQ CQ for connect request\n");
776 goto errout;
777 }
778
779 memset(&qp_attr, 0, sizeof qp_attr);
780 qp_attr.event_handler = qp_event_handler;
781 qp_attr.qp_context = &newxprt->sc_xprt;
782 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
783 qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
784 qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
785 qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
786 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
787 qp_attr.qp_type = IB_QPT_RC;
788 qp_attr.send_cq = newxprt->sc_sq_cq;
789 qp_attr.recv_cq = newxprt->sc_rq_cq;
790 dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
791 " cm_id->device=%p, sc_pd->device=%p\n"
792 " cap.max_send_wr = %d\n"
793 " cap.max_recv_wr = %d\n"
794 " cap.max_send_sge = %d\n"
795 " cap.max_recv_sge = %d\n",
796 newxprt->sc_cm_id, newxprt->sc_pd,
797 newxprt->sc_cm_id->device, newxprt->sc_pd->device,
798 qp_attr.cap.max_send_wr,
799 qp_attr.cap.max_recv_wr,
800 qp_attr.cap.max_send_sge,
801 qp_attr.cap.max_recv_sge);
802
803 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
804 if (ret) {
805 /*
806 * XXX: This is a hack. We need a xx_request_qp interface
807 * that will adjust the qp_attr's with a best-effort
808 * number
809 */
810 qp_attr.cap.max_send_sge -= 2;
811 qp_attr.cap.max_recv_sge -= 2;
812 ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
813 &qp_attr);
814 if (ret) {
815 dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
816 goto errout;
817 }
818		newxprt->sc_max_sge = min(qp_attr.cap.max_send_sge,
819					  qp_attr.cap.max_recv_sge);
820 newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
821 newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
822 }
823 newxprt->sc_qp = newxprt->sc_cm_id->qp;
824
825 /* Register all of physical memory */
826 newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
827 IB_ACCESS_LOCAL_WRITE |
828 IB_ACCESS_REMOTE_WRITE);
829 if (IS_ERR(newxprt->sc_phys_mr)) {
830 dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
831 goto errout;
832 }
833
834 /* Post receive buffers */
835 for (i = 0; i < newxprt->sc_max_requests; i++) {
836 ret = svc_rdma_post_recv(newxprt);
837 if (ret) {
838 dprintk("svcrdma: failure posting receive buffers\n");
839 goto errout;
840 }
841 }
842
843 /* Swap out the handler */
844 newxprt->sc_cm_id->event_handler = rdma_cma_handler;
845
846 /* Accept Connection */
847 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
848 memset(&conn_param, 0, sizeof conn_param);
849 conn_param.responder_resources = 0;
850 conn_param.initiator_depth = newxprt->sc_ord;
851 ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
852 if (ret) {
853 dprintk("svcrdma: failed to accept new connection, ret=%d\n",
854 ret);
855 goto errout;
856 }
857
858 dprintk("svcrdma: new connection %p accepted with the following "
859 "attributes:\n"
860 " local_ip : %d.%d.%d.%d\n"
861 " local_port : %d\n"
862 " remote_ip : %d.%d.%d.%d\n"
863 " remote_port : %d\n"
864 " max_sge : %d\n"
865 " sq_depth : %d\n"
866 " max_requests : %d\n"
867 " ord : %d\n",
868 newxprt,
869 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
870 route.addr.src_addr)->sin_addr.s_addr),
871 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
872 route.addr.src_addr)->sin_port),
873 NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
874 route.addr.dst_addr)->sin_addr.s_addr),
875 ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
876 route.addr.dst_addr)->sin_port),
877 newxprt->sc_max_sge,
878 newxprt->sc_sq_depth,
879 newxprt->sc_max_requests,
880 newxprt->sc_ord);
881
882 /* Set the local and remote addresses in the transport */
883 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
884 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
885 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
886 svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
887
888 ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
889 ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
890 return &newxprt->sc_xprt;
891
892 errout:
893 dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
894 rdma_destroy_id(newxprt->sc_cm_id);
895 rdma_destroy_xprt(newxprt);
896 return NULL;
897}
898
899/*
900 * Post an RQ WQE to the RQ when the rqst is being released. This
901 * effectively returns an RQ credit to the client. The rq_xprt_ctxt
902 * will be NULL if the request was deferred due to an RDMA_READ or the
903 * transport had no data ready (EAGAIN). Note that an RPC deferred in
904 * svc_process will still return the credit; this is because the data
905 * is copied and no longer consumes a WQE/WC.
906 */
907static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
908{
909 int err;
910 struct svcxprt_rdma *rdma =
911 container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
912 if (rqstp->rq_xprt_ctxt) {
913 BUG_ON(rqstp->rq_xprt_ctxt != rdma);
914 err = svc_rdma_post_recv(rdma);
915 if (err)
916 dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
917 err);
918 }
919 rqstp->rq_xprt_ctxt = NULL;
920}
921
922/* Disable data ready events for this connection */
923static void svc_rdma_detach(struct svc_xprt *xprt)
924{
925 struct svcxprt_rdma *rdma =
926 container_of(xprt, struct svcxprt_rdma, sc_xprt);
927 unsigned long flags;
928
929 dprintk("svc: svc_rdma_detach(%p)\n", xprt);
930 /*
931 * Shutdown the connection. This will ensure we don't get any
932 * more events from the provider.
933 */
934 rdma_disconnect(rdma->sc_cm_id);
935 rdma_destroy_id(rdma->sc_cm_id);
936
937 /* We may already be on the DTO list */
938 spin_lock_irqsave(&dto_lock, flags);
939 if (!list_empty(&rdma->sc_dto_q))
940 list_del_init(&rdma->sc_dto_q);
941 spin_unlock_irqrestore(&dto_lock, flags);
942}
943
944static void svc_rdma_free(struct svc_xprt *xprt)
945{
946 struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt;
947 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
948 rdma_destroy_xprt(rdma);
949 kfree(rdma);
950}
951
952static void rdma_destroy_xprt(struct svcxprt_rdma *xprt)
953{
954 if (xprt->sc_qp && !IS_ERR(xprt->sc_qp))
955 ib_destroy_qp(xprt->sc_qp);
956
957 if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq))
958 ib_destroy_cq(xprt->sc_sq_cq);
959
960 if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq))
961 ib_destroy_cq(xprt->sc_rq_cq);
962
963 if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr))
964 ib_dereg_mr(xprt->sc_phys_mr);
965
966 if (xprt->sc_pd && !IS_ERR(xprt->sc_pd))
967 ib_dealloc_pd(xprt->sc_pd);
968
969 destroy_context_cache(xprt->sc_ctxt_head);
970}
971
972static int svc_rdma_has_wspace(struct svc_xprt *xprt)
973{
974 struct svcxprt_rdma *rdma =
975 container_of(xprt, struct svcxprt_rdma, sc_xprt);
976
977 /*
978 * If there are fewer SQ WR available than required to send a
979 * simple response, return false.
980 */
981 if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
982 return 0;
983
984 /*
985 * ...or there are already waiters on the SQ,
986 * return false.
987 */
988 if (waitqueue_active(&rdma->sc_send_wait))
989 return 0;
990
991 /* Otherwise return true. */
992 return 1;
993}
994
995int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
996{
997 struct ib_send_wr *bad_wr;
998 int ret;
999
1000 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1001 return 0;
1002
1003 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1004 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
1005 wr->opcode);
1006 /* If the SQ is full, wait until an SQ entry is available */
1007 while (1) {
1008 spin_lock_bh(&xprt->sc_lock);
1009 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
1010 spin_unlock_bh(&xprt->sc_lock);
1011 atomic_inc(&rdma_stat_sq_starve);
1012 /* See if we can reap some SQ WR */
1013 sq_cq_reap(xprt);
1014
1015 /* Wait until SQ WR available if SQ still full */
1016 wait_event(xprt->sc_send_wait,
1017 atomic_read(&xprt->sc_sq_count) <
1018 xprt->sc_sq_depth);
1019 continue;
1020 }
1021		/* Bump the used SQ WR count and post */
1022 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1023 if (!ret)
1024 atomic_inc(&xprt->sc_sq_count);
1025 else
1026 dprintk("svcrdma: failed to post SQ WR rc=%d, "
1027 "sc_sq_count=%d, sc_sq_depth=%d\n",
1028 ret, atomic_read(&xprt->sc_sq_count),
1029 xprt->sc_sq_depth);
1030 spin_unlock_bh(&xprt->sc_lock);
1031 break;
1032 }
1033 return ret;
1034}
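/*
 * Caller sketch, mirroring send_reply() and send_write() above:
 * build a signaled WR whose wr_id carries the op context, then let
 * this routine absorb SQ back-pressure:
 *
 *	memset(&wr, 0, sizeof wr);
 *	ctxt->wr_op = IB_WR_SEND;
 *	wr.wr_id = (unsigned long)ctxt;
 *	wr.opcode = IB_WR_SEND;
 *	wr.send_flags = IB_SEND_SIGNALED;
 *	if (svc_rdma_send(rdma, &wr))
 *		svc_rdma_put_context(ctxt, 1);
 */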
1035
1036int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1037 enum rpcrdma_errcode err)
1038{
1039 struct ib_send_wr err_wr;
1040 struct ib_sge sge;
1041 struct page *p;
1042 struct svc_rdma_op_ctxt *ctxt;
1043 u32 *va;
1044 int length;
1045 int ret;
1046
1047 p = svc_rdma_get_page();
1048 va = page_address(p);
1049
1050 /* XDR encode error */
1051 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1052
1053 /* Prepare SGE for local address */
1054 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
1055				   p, 0, PAGE_SIZE, DMA_TO_DEVICE);
1056 sge.lkey = xprt->sc_phys_mr->lkey;
1057 sge.length = length;
1058
1059	ctxt = svc_rdma_get_context(xprt);
1060	ctxt->direction = DMA_TO_DEVICE;
1061	ctxt->count = 1;
	ctxt->pages[0] = p;
1062
1063 /* Prepare SEND WR */
1064 memset(&err_wr, 0, sizeof err_wr);
1065 ctxt->wr_op = IB_WR_SEND;
1066 err_wr.wr_id = (unsigned long)ctxt;
1067 err_wr.sg_list = &sge;
1068 err_wr.num_sge = 1;
1069 err_wr.opcode = IB_WR_SEND;
1070 err_wr.send_flags = IB_SEND_SIGNALED;
1071
1072 /* Post It */
1073 ret = svc_rdma_send(xprt, &err_wr);
1074 if (ret) {
1075 dprintk("svcrdma: Error posting send = %d\n", ret);
1076 svc_rdma_put_context(ctxt, 1);
1077 }
1078
1079 return ret;
1080}