aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/9p/client.c26
-rw-r--r--net/ceph/ceph_common.c26
-rw-r--r--net/ceph/messenger.c456
-rw-r--r--net/ceph/osdmap.c3
-rw-r--r--net/core/skbuff.c4
-rw-r--r--net/ipv6/route.c8
-rw-r--r--net/netfilter/nf_conntrack_core.c39
-rw-r--r--net/netfilter/nf_conntrack_proto.c21
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c45
-rw-r--r--net/netfilter/xt_CT.c31
-rw-r--r--net/netfilter/xt_LOG.c12
11 files changed, 381 insertions, 290 deletions
diff --git a/net/9p/client.c b/net/9p/client.c
index 776618cd2be5..b23a17c431c8 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -740,10 +740,18 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
740 c->status = Disconnected; 740 c->status = Disconnected;
741 goto reterr; 741 goto reterr;
742 } 742 }
743again:
743 /* Wait for the response */ 744 /* Wait for the response */
744 err = wait_event_interruptible(*req->wq, 745 err = wait_event_interruptible(*req->wq,
745 req->status >= REQ_STATUS_RCVD); 746 req->status >= REQ_STATUS_RCVD);
746 747
748 if ((err == -ERESTARTSYS) && (c->status == Connected)
749 && (type == P9_TFLUSH)) {
750 sigpending = 1;
751 clear_thread_flag(TIF_SIGPENDING);
752 goto again;
753 }
754
747 if (req->status == REQ_STATUS_ERROR) { 755 if (req->status == REQ_STATUS_ERROR) {
748 p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); 756 p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
749 err = req->t_err; 757 err = req->t_err;
@@ -1420,6 +1428,7 @@ int p9_client_clunk(struct p9_fid *fid)
1420 int err; 1428 int err;
1421 struct p9_client *clnt; 1429 struct p9_client *clnt;
1422 struct p9_req_t *req; 1430 struct p9_req_t *req;
1431 int retries = 0;
1423 1432
1424 if (!fid) { 1433 if (!fid) {
1425 pr_warn("%s (%d): Trying to clunk with NULL fid\n", 1434 pr_warn("%s (%d): Trying to clunk with NULL fid\n",
@@ -1428,7 +1437,9 @@ int p9_client_clunk(struct p9_fid *fid)
1428 return 0; 1437 return 0;
1429 } 1438 }
1430 1439
1431 p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid); 1440again:
1441 p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid,
1442 retries);
1432 err = 0; 1443 err = 0;
1433 clnt = fid->clnt; 1444 clnt = fid->clnt;
1434 1445
@@ -1444,8 +1455,14 @@ int p9_client_clunk(struct p9_fid *fid)
1444error: 1455error:
1445 /* 1456 /*
1446 * Fid is not valid even after a failed clunk 1457 * Fid is not valid even after a failed clunk
1458 * If interrupted, retry once then give up and
1459 * leak fid until umount.
1447 */ 1460 */
1448 p9_fid_destroy(fid); 1461 if (err == -ERESTARTSYS) {
1462 if (retries++ == 0)
1463 goto again;
1464 } else
1465 p9_fid_destroy(fid);
1449 return err; 1466 return err;
1450} 1467}
1451EXPORT_SYMBOL(p9_client_clunk); 1468EXPORT_SYMBOL(p9_client_clunk);
@@ -1470,7 +1487,10 @@ int p9_client_remove(struct p9_fid *fid)
1470 1487
1471 p9_free_req(clnt, req); 1488 p9_free_req(clnt, req);
1472error: 1489error:
1473 p9_fid_destroy(fid); 1490 if (err == -ERESTARTSYS)
1491 p9_client_clunk(fid);
1492 else
1493 p9_fid_destroy(fid);
1474 return err; 1494 return err;
1475} 1495}
1476EXPORT_SYMBOL(p9_client_remove); 1496EXPORT_SYMBOL(p9_client_remove);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 761ad9d6cc3b..cc913193d992 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -201,7 +201,9 @@ enum {
201 Opt_ip, 201 Opt_ip,
202 Opt_last_string, 202 Opt_last_string,
203 /* string args above */ 203 /* string args above */
204 Opt_share,
204 Opt_noshare, 205 Opt_noshare,
206 Opt_crc,
205 Opt_nocrc, 207 Opt_nocrc,
206}; 208};
207 209
@@ -217,7 +219,9 @@ static match_table_t opt_tokens = {
217 {Opt_key, "key=%s"}, 219 {Opt_key, "key=%s"},
218 {Opt_ip, "ip=%s"}, 220 {Opt_ip, "ip=%s"},
219 /* string args above */ 221 /* string args above */
222 {Opt_share, "share"},
220 {Opt_noshare, "noshare"}, 223 {Opt_noshare, "noshare"},
224 {Opt_crc, "crc"},
221 {Opt_nocrc, "nocrc"}, 225 {Opt_nocrc, "nocrc"},
222 {-1, NULL} 226 {-1, NULL}
223}; 227};
@@ -277,10 +281,11 @@ out:
277 return err; 281 return err;
278} 282}
279 283
280int ceph_parse_options(struct ceph_options **popt, char *options, 284struct ceph_options *
281 const char *dev_name, const char *dev_name_end, 285ceph_parse_options(char *options, const char *dev_name,
282 int (*parse_extra_token)(char *c, void *private), 286 const char *dev_name_end,
283 void *private) 287 int (*parse_extra_token)(char *c, void *private),
288 void *private)
284{ 289{
285 struct ceph_options *opt; 290 struct ceph_options *opt;
286 const char *c; 291 const char *c;
@@ -289,7 +294,7 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
289 294
290 opt = kzalloc(sizeof(*opt), GFP_KERNEL); 295 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
291 if (!opt) 296 if (!opt)
292 return err; 297 return ERR_PTR(-ENOMEM);
293 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), 298 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
294 GFP_KERNEL); 299 GFP_KERNEL);
295 if (!opt->mon_addr) 300 if (!opt->mon_addr)
@@ -398,10 +403,16 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
398 opt->mount_timeout = intval; 403 opt->mount_timeout = intval;
399 break; 404 break;
400 405
406 case Opt_share:
407 opt->flags &= ~CEPH_OPT_NOSHARE;
408 break;
401 case Opt_noshare: 409 case Opt_noshare:
402 opt->flags |= CEPH_OPT_NOSHARE; 410 opt->flags |= CEPH_OPT_NOSHARE;
403 break; 411 break;
404 412
413 case Opt_crc:
414 opt->flags &= ~CEPH_OPT_NOCRC;
415 break;
405 case Opt_nocrc: 416 case Opt_nocrc:
406 opt->flags |= CEPH_OPT_NOCRC; 417 opt->flags |= CEPH_OPT_NOCRC;
407 break; 418 break;
@@ -412,12 +423,11 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
412 } 423 }
413 424
414 /* success */ 425 /* success */
415 *popt = opt; 426 return opt;
416 return 0;
417 427
418out: 428out:
419 ceph_destroy_options(opt); 429 ceph_destroy_options(opt);
420 return err; 430 return ERR_PTR(err);
421} 431}
422EXPORT_SYMBOL(ceph_parse_options); 432EXPORT_SYMBOL(ceph_parse_options);
423 433
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ad5b70801f37..f0993af2ae4d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -38,48 +38,54 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
38static struct lock_class_key socket_class; 38static struct lock_class_key socket_class;
39#endif 39#endif
40 40
41/*
42 * When skipping (ignoring) a block of input we read it into a "skip
43 * buffer," which is this many bytes in size.
44 */
45#define SKIP_BUF_SIZE 1024
41 46
42static void queue_con(struct ceph_connection *con); 47static void queue_con(struct ceph_connection *con);
43static void con_work(struct work_struct *); 48static void con_work(struct work_struct *);
44static void ceph_fault(struct ceph_connection *con); 49static void ceph_fault(struct ceph_connection *con);
45 50
46/* 51/*
47 * nicely render a sockaddr as a string. 52 * Nicely render a sockaddr as a string. An array of formatted
53 * strings is used, to approximate reentrancy.
48 */ 54 */
49#define MAX_ADDR_STR 20 55#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */
50#define MAX_ADDR_STR_LEN 60 56#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG)
51static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; 57#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1)
52static DEFINE_SPINLOCK(addr_str_lock); 58#define MAX_ADDR_STR_LEN 64 /* 54 is enough */
53static int last_addr_str; 59
60static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
61static atomic_t addr_str_seq = ATOMIC_INIT(0);
62
63static struct page *zero_page; /* used in certain error cases */
54 64
55const char *ceph_pr_addr(const struct sockaddr_storage *ss) 65const char *ceph_pr_addr(const struct sockaddr_storage *ss)
56{ 66{
57 int i; 67 int i;
58 char *s; 68 char *s;
59 struct sockaddr_in *in4 = (void *)ss; 69 struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
60 struct sockaddr_in6 *in6 = (void *)ss; 70 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
61 71
62 spin_lock(&addr_str_lock); 72 i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
63 i = last_addr_str++;
64 if (last_addr_str == MAX_ADDR_STR)
65 last_addr_str = 0;
66 spin_unlock(&addr_str_lock);
67 s = addr_str[i]; 73 s = addr_str[i];
68 74
69 switch (ss->ss_family) { 75 switch (ss->ss_family) {
70 case AF_INET: 76 case AF_INET:
71 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, 77 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
72 (unsigned int)ntohs(in4->sin_port)); 78 ntohs(in4->sin_port));
73 break; 79 break;
74 80
75 case AF_INET6: 81 case AF_INET6:
76 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, 82 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
77 (unsigned int)ntohs(in6->sin6_port)); 83 ntohs(in6->sin6_port));
78 break; 84 break;
79 85
80 default: 86 default:
81 snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", 87 snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
82 (int)ss->ss_family); 88 ss->ss_family);
83 } 89 }
84 90
85 return s; 91 return s;
@@ -95,22 +101,43 @@ static void encode_my_addr(struct ceph_messenger *msgr)
95/* 101/*
96 * work queue for all reading and writing to/from the socket. 102 * work queue for all reading and writing to/from the socket.
97 */ 103 */
98struct workqueue_struct *ceph_msgr_wq; 104static struct workqueue_struct *ceph_msgr_wq;
105
106void _ceph_msgr_exit(void)
107{
108 if (ceph_msgr_wq) {
109 destroy_workqueue(ceph_msgr_wq);
110 ceph_msgr_wq = NULL;
111 }
112
113 BUG_ON(zero_page == NULL);
114 kunmap(zero_page);
115 page_cache_release(zero_page);
116 zero_page = NULL;
117}
99 118
100int ceph_msgr_init(void) 119int ceph_msgr_init(void)
101{ 120{
121 BUG_ON(zero_page != NULL);
122 zero_page = ZERO_PAGE(0);
123 page_cache_get(zero_page);
124
102 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); 125 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
103 if (!ceph_msgr_wq) { 126 if (ceph_msgr_wq)
104 pr_err("msgr_init failed to create workqueue\n"); 127 return 0;
105 return -ENOMEM; 128
106 } 129 pr_err("msgr_init failed to create workqueue\n");
107 return 0; 130 _ceph_msgr_exit();
131
132 return -ENOMEM;
108} 133}
109EXPORT_SYMBOL(ceph_msgr_init); 134EXPORT_SYMBOL(ceph_msgr_init);
110 135
111void ceph_msgr_exit(void) 136void ceph_msgr_exit(void)
112{ 137{
113 destroy_workqueue(ceph_msgr_wq); 138 BUG_ON(ceph_msgr_wq == NULL);
139
140 _ceph_msgr_exit();
114} 141}
115EXPORT_SYMBOL(ceph_msgr_exit); 142EXPORT_SYMBOL(ceph_msgr_exit);
116 143
@@ -128,8 +155,8 @@ EXPORT_SYMBOL(ceph_msgr_flush);
128/* data available on socket, or listen socket received a connect */ 155/* data available on socket, or listen socket received a connect */
129static void ceph_data_ready(struct sock *sk, int count_unused) 156static void ceph_data_ready(struct sock *sk, int count_unused)
130{ 157{
131 struct ceph_connection *con = 158 struct ceph_connection *con = sk->sk_user_data;
132 (struct ceph_connection *)sk->sk_user_data; 159
133 if (sk->sk_state != TCP_CLOSE_WAIT) { 160 if (sk->sk_state != TCP_CLOSE_WAIT) {
134 dout("ceph_data_ready on %p state = %lu, queueing work\n", 161 dout("ceph_data_ready on %p state = %lu, queueing work\n",
135 con, con->state); 162 con, con->state);
@@ -140,26 +167,30 @@ static void ceph_data_ready(struct sock *sk, int count_unused)
140/* socket has buffer space for writing */ 167/* socket has buffer space for writing */
141static void ceph_write_space(struct sock *sk) 168static void ceph_write_space(struct sock *sk)
142{ 169{
143 struct ceph_connection *con = 170 struct ceph_connection *con = sk->sk_user_data;
144 (struct ceph_connection *)sk->sk_user_data;
145 171
146 /* only queue to workqueue if there is data we want to write. */ 172 /* only queue to workqueue if there is data we want to write,
173 * and there is sufficient space in the socket buffer to accept
174 * more data. clear SOCK_NOSPACE so that ceph_write_space()
175 * doesn't get called again until try_write() fills the socket
176 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
177 * and net/core/stream.c:sk_stream_write_space().
178 */
147 if (test_bit(WRITE_PENDING, &con->state)) { 179 if (test_bit(WRITE_PENDING, &con->state)) {
148 dout("ceph_write_space %p queueing write work\n", con); 180 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
149 queue_con(con); 181 dout("ceph_write_space %p queueing write work\n", con);
182 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
183 queue_con(con);
184 }
150 } else { 185 } else {
151 dout("ceph_write_space %p nothing to write\n", con); 186 dout("ceph_write_space %p nothing to write\n", con);
152 } 187 }
153
154 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
155 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
156} 188}
157 189
158/* socket's state has changed */ 190/* socket's state has changed */
159static void ceph_state_change(struct sock *sk) 191static void ceph_state_change(struct sock *sk)
160{ 192{
161 struct ceph_connection *con = 193 struct ceph_connection *con = sk->sk_user_data;
162 (struct ceph_connection *)sk->sk_user_data;
163 194
164 dout("ceph_state_change %p state = %lu sk_state = %u\n", 195 dout("ceph_state_change %p state = %lu sk_state = %u\n",
165 con, con->state, sk->sk_state); 196 con, con->state, sk->sk_state);
@@ -184,6 +215,8 @@ static void ceph_state_change(struct sock *sk)
184 dout("ceph_state_change TCP_ESTABLISHED\n"); 215 dout("ceph_state_change TCP_ESTABLISHED\n");
185 queue_con(con); 216 queue_con(con);
186 break; 217 break;
218 default: /* Everything else is uninteresting */
219 break;
187 } 220 }
188} 221}
189 222
@@ -194,7 +227,7 @@ static void set_sock_callbacks(struct socket *sock,
194 struct ceph_connection *con) 227 struct ceph_connection *con)
195{ 228{
196 struct sock *sk = sock->sk; 229 struct sock *sk = sock->sk;
197 sk->sk_user_data = (void *)con; 230 sk->sk_user_data = con;
198 sk->sk_data_ready = ceph_data_ready; 231 sk->sk_data_ready = ceph_data_ready;
199 sk->sk_write_space = ceph_write_space; 232 sk->sk_write_space = ceph_write_space;
200 sk->sk_state_change = ceph_state_change; 233 sk->sk_state_change = ceph_state_change;
@@ -208,7 +241,7 @@ static void set_sock_callbacks(struct socket *sock,
208/* 241/*
209 * initiate connection to a remote socket. 242 * initiate connection to a remote socket.
210 */ 243 */
211static struct socket *ceph_tcp_connect(struct ceph_connection *con) 244static int ceph_tcp_connect(struct ceph_connection *con)
212{ 245{
213 struct sockaddr_storage *paddr = &con->peer_addr.in_addr; 246 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
214 struct socket *sock; 247 struct socket *sock;
@@ -218,8 +251,7 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
218 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, 251 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
219 IPPROTO_TCP, &sock); 252 IPPROTO_TCP, &sock);
220 if (ret) 253 if (ret)
221 return ERR_PTR(ret); 254 return ret;
222 con->sock = sock;
223 sock->sk->sk_allocation = GFP_NOFS; 255 sock->sk->sk_allocation = GFP_NOFS;
224 256
225#ifdef CONFIG_LOCKDEP 257#ifdef CONFIG_LOCKDEP
@@ -236,19 +268,17 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
236 dout("connect %s EINPROGRESS sk_state = %u\n", 268 dout("connect %s EINPROGRESS sk_state = %u\n",
237 ceph_pr_addr(&con->peer_addr.in_addr), 269 ceph_pr_addr(&con->peer_addr.in_addr),
238 sock->sk->sk_state); 270 sock->sk->sk_state);
239 ret = 0; 271 } else if (ret < 0) {
240 }
241 if (ret < 0) {
242 pr_err("connect %s error %d\n", 272 pr_err("connect %s error %d\n",
243 ceph_pr_addr(&con->peer_addr.in_addr), ret); 273 ceph_pr_addr(&con->peer_addr.in_addr), ret);
244 sock_release(sock); 274 sock_release(sock);
245 con->sock = NULL;
246 con->error_msg = "connect error"; 275 con->error_msg = "connect error";
276
277 return ret;
247 } 278 }
279 con->sock = sock;
248 280
249 if (ret < 0) 281 return 0;
250 return ERR_PTR(ret);
251 return sock;
252} 282}
253 283
254static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) 284static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
@@ -284,6 +314,19 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
284 return r; 314 return r;
285} 315}
286 316
317static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
318 int offset, size_t size, int more)
319{
320 int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
321 int ret;
322
323 ret = kernel_sendpage(sock, page, offset, size, flags);
324 if (ret == -EAGAIN)
325 ret = 0;
326
327 return ret;
328}
329
287 330
288/* 331/*
289 * Shutdown/close the socket for the given connection. 332 * Shutdown/close the socket for the given connection.
@@ -391,22 +434,23 @@ bool ceph_con_opened(struct ceph_connection *con)
391 */ 434 */
392struct ceph_connection *ceph_con_get(struct ceph_connection *con) 435struct ceph_connection *ceph_con_get(struct ceph_connection *con)
393{ 436{
394 dout("con_get %p nref = %d -> %d\n", con, 437 int nref = __atomic_add_unless(&con->nref, 1, 0);
395 atomic_read(&con->nref), atomic_read(&con->nref) + 1); 438
396 if (atomic_inc_not_zero(&con->nref)) 439 dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1);
397 return con; 440
398 return NULL; 441 return nref ? con : NULL;
399} 442}
400 443
401void ceph_con_put(struct ceph_connection *con) 444void ceph_con_put(struct ceph_connection *con)
402{ 445{
403 dout("con_put %p nref = %d -> %d\n", con, 446 int nref = atomic_dec_return(&con->nref);
404 atomic_read(&con->nref), atomic_read(&con->nref) - 1); 447
405 BUG_ON(atomic_read(&con->nref) == 0); 448 BUG_ON(nref < 0);
406 if (atomic_dec_and_test(&con->nref)) { 449 if (nref == 0) {
407 BUG_ON(con->sock); 450 BUG_ON(con->sock);
408 kfree(con); 451 kfree(con);
409 } 452 }
453 dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref);
410} 454}
411 455
412/* 456/*
@@ -442,14 +486,35 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
442 return ret; 486 return ret;
443} 487}
444 488
489static void ceph_con_out_kvec_reset(struct ceph_connection *con)
490{
491 con->out_kvec_left = 0;
492 con->out_kvec_bytes = 0;
493 con->out_kvec_cur = &con->out_kvec[0];
494}
495
496static void ceph_con_out_kvec_add(struct ceph_connection *con,
497 size_t size, void *data)
498{
499 int index;
500
501 index = con->out_kvec_left;
502 BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
503
504 con->out_kvec[index].iov_len = size;
505 con->out_kvec[index].iov_base = data;
506 con->out_kvec_left++;
507 con->out_kvec_bytes += size;
508}
445 509
446/* 510/*
447 * Prepare footer for currently outgoing message, and finish things 511 * Prepare footer for currently outgoing message, and finish things
448 * off. Assumes out_kvec* are already valid.. we just add on to the end. 512 * off. Assumes out_kvec* are already valid.. we just add on to the end.
449 */ 513 */
450static void prepare_write_message_footer(struct ceph_connection *con, int v) 514static void prepare_write_message_footer(struct ceph_connection *con)
451{ 515{
452 struct ceph_msg *m = con->out_msg; 516 struct ceph_msg *m = con->out_msg;
517 int v = con->out_kvec_left;
453 518
454 dout("prepare_write_message_footer %p\n", con); 519 dout("prepare_write_message_footer %p\n", con);
455 con->out_kvec_is_msg = true; 520 con->out_kvec_is_msg = true;
@@ -467,9 +532,9 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v)
467static void prepare_write_message(struct ceph_connection *con) 532static void prepare_write_message(struct ceph_connection *con)
468{ 533{
469 struct ceph_msg *m; 534 struct ceph_msg *m;
470 int v = 0; 535 u32 crc;
471 536
472 con->out_kvec_bytes = 0; 537 ceph_con_out_kvec_reset(con);
473 con->out_kvec_is_msg = true; 538 con->out_kvec_is_msg = true;
474 con->out_msg_done = false; 539 con->out_msg_done = false;
475 540
@@ -477,16 +542,13 @@ static void prepare_write_message(struct ceph_connection *con)
477 * TCP packet that's a good thing. */ 542 * TCP packet that's a good thing. */
478 if (con->in_seq > con->in_seq_acked) { 543 if (con->in_seq > con->in_seq_acked) {
479 con->in_seq_acked = con->in_seq; 544 con->in_seq_acked = con->in_seq;
480 con->out_kvec[v].iov_base = &tag_ack; 545 ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
481 con->out_kvec[v++].iov_len = 1;
482 con->out_temp_ack = cpu_to_le64(con->in_seq_acked); 546 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
483 con->out_kvec[v].iov_base = &con->out_temp_ack; 547 ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
484 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); 548 &con->out_temp_ack);
485 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
486 } 549 }
487 550
488 m = list_first_entry(&con->out_queue, 551 m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
489 struct ceph_msg, list_head);
490 con->out_msg = m; 552 con->out_msg = m;
491 553
492 /* put message on sent list */ 554 /* put message on sent list */
@@ -510,30 +572,26 @@ static void prepare_write_message(struct ceph_connection *con)
510 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); 572 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
511 573
512 /* tag + hdr + front + middle */ 574 /* tag + hdr + front + middle */
513 con->out_kvec[v].iov_base = &tag_msg; 575 ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
514 con->out_kvec[v++].iov_len = 1; 576 ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
515 con->out_kvec[v].iov_base = &m->hdr; 577 ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
516 con->out_kvec[v++].iov_len = sizeof(m->hdr); 578
517 con->out_kvec[v++] = m->front;
518 if (m->middle) 579 if (m->middle)
519 con->out_kvec[v++] = m->middle->vec; 580 ceph_con_out_kvec_add(con, m->middle->vec.iov_len,
520 con->out_kvec_left = v; 581 m->middle->vec.iov_base);
521 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
522 (m->middle ? m->middle->vec.iov_len : 0);
523 con->out_kvec_cur = con->out_kvec;
524 582
525 /* fill in crc (except data pages), footer */ 583 /* fill in crc (except data pages), footer */
526 con->out_msg->hdr.crc = 584 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
527 cpu_to_le32(crc32c(0, (void *)&m->hdr, 585 con->out_msg->hdr.crc = cpu_to_le32(crc);
528 sizeof(m->hdr) - sizeof(m->hdr.crc)));
529 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; 586 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
530 con->out_msg->footer.front_crc = 587
531 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); 588 crc = crc32c(0, m->front.iov_base, m->front.iov_len);
532 if (m->middle) 589 con->out_msg->footer.front_crc = cpu_to_le32(crc);
533 con->out_msg->footer.middle_crc = 590 if (m->middle) {
534 cpu_to_le32(crc32c(0, m->middle->vec.iov_base, 591 crc = crc32c(0, m->middle->vec.iov_base,
535 m->middle->vec.iov_len)); 592 m->middle->vec.iov_len);
536 else 593 con->out_msg->footer.middle_crc = cpu_to_le32(crc);
594 } else
537 con->out_msg->footer.middle_crc = 0; 595 con->out_msg->footer.middle_crc = 0;
538 con->out_msg->footer.data_crc = 0; 596 con->out_msg->footer.data_crc = 0;
539 dout("prepare_write_message front_crc %u data_crc %u\n", 597 dout("prepare_write_message front_crc %u data_crc %u\n",
@@ -549,11 +607,11 @@ static void prepare_write_message(struct ceph_connection *con)
549 else 607 else
550 con->out_msg_pos.page_pos = 0; 608 con->out_msg_pos.page_pos = 0;
551 con->out_msg_pos.data_pos = 0; 609 con->out_msg_pos.data_pos = 0;
552 con->out_msg_pos.did_page_crc = 0; 610 con->out_msg_pos.did_page_crc = false;
553 con->out_more = 1; /* data + footer will follow */ 611 con->out_more = 1; /* data + footer will follow */
554 } else { 612 } else {
555 /* no, queue up footer too and be done */ 613 /* no, queue up footer too and be done */
556 prepare_write_message_footer(con, v); 614 prepare_write_message_footer(con);
557 } 615 }
558 616
559 set_bit(WRITE_PENDING, &con->state); 617 set_bit(WRITE_PENDING, &con->state);
@@ -568,14 +626,14 @@ static void prepare_write_ack(struct ceph_connection *con)
568 con->in_seq_acked, con->in_seq); 626 con->in_seq_acked, con->in_seq);
569 con->in_seq_acked = con->in_seq; 627 con->in_seq_acked = con->in_seq;
570 628
571 con->out_kvec[0].iov_base = &tag_ack; 629 ceph_con_out_kvec_reset(con);
572 con->out_kvec[0].iov_len = 1; 630
631 ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
632
573 con->out_temp_ack = cpu_to_le64(con->in_seq_acked); 633 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
574 con->out_kvec[1].iov_base = &con->out_temp_ack; 634 ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
575 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); 635 &con->out_temp_ack);
576 con->out_kvec_left = 2; 636
577 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
578 con->out_kvec_cur = con->out_kvec;
579 con->out_more = 1; /* more will follow.. eventually.. */ 637 con->out_more = 1; /* more will follow.. eventually.. */
580 set_bit(WRITE_PENDING, &con->state); 638 set_bit(WRITE_PENDING, &con->state);
581} 639}
@@ -586,11 +644,8 @@ static void prepare_write_ack(struct ceph_connection *con)
586static void prepare_write_keepalive(struct ceph_connection *con) 644static void prepare_write_keepalive(struct ceph_connection *con)
587{ 645{
588 dout("prepare_write_keepalive %p\n", con); 646 dout("prepare_write_keepalive %p\n", con);
589 con->out_kvec[0].iov_base = &tag_keepalive; 647 ceph_con_out_kvec_reset(con);
590 con->out_kvec[0].iov_len = 1; 648 ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
591 con->out_kvec_left = 1;
592 con->out_kvec_bytes = 1;
593 con->out_kvec_cur = con->out_kvec;
594 set_bit(WRITE_PENDING, &con->state); 649 set_bit(WRITE_PENDING, &con->state);
595} 650}
596 651
@@ -619,12 +674,9 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
619 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); 674 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
620 con->out_connect.authorizer_len = cpu_to_le32(auth_len); 675 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
621 676
622 if (auth_len) { 677 if (auth_len)
623 con->out_kvec[con->out_kvec_left].iov_base = auth_buf; 678 ceph_con_out_kvec_add(con, auth_len, auth_buf);
624 con->out_kvec[con->out_kvec_left].iov_len = auth_len; 679
625 con->out_kvec_left++;
626 con->out_kvec_bytes += auth_len;
627 }
628 return 0; 680 return 0;
629} 681}
630 682
@@ -634,22 +686,18 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
634static void prepare_write_banner(struct ceph_messenger *msgr, 686static void prepare_write_banner(struct ceph_messenger *msgr,
635 struct ceph_connection *con) 687 struct ceph_connection *con)
636{ 688{
637 int len = strlen(CEPH_BANNER); 689 ceph_con_out_kvec_reset(con);
690 ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
691 ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr),
692 &msgr->my_enc_addr);
638 693
639 con->out_kvec[0].iov_base = CEPH_BANNER;
640 con->out_kvec[0].iov_len = len;
641 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
642 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
643 con->out_kvec_left = 2;
644 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
645 con->out_kvec_cur = con->out_kvec;
646 con->out_more = 0; 694 con->out_more = 0;
647 set_bit(WRITE_PENDING, &con->state); 695 set_bit(WRITE_PENDING, &con->state);
648} 696}
649 697
650static int prepare_write_connect(struct ceph_messenger *msgr, 698static int prepare_write_connect(struct ceph_messenger *msgr,
651 struct ceph_connection *con, 699 struct ceph_connection *con,
652 int after_banner) 700 int include_banner)
653{ 701{
654 unsigned global_seq = get_global_seq(con->msgr, 0); 702 unsigned global_seq = get_global_seq(con->msgr, 0);
655 int proto; 703 int proto;
@@ -678,22 +726,18 @@ static int prepare_write_connect(struct ceph_messenger *msgr,
678 con->out_connect.protocol_version = cpu_to_le32(proto); 726 con->out_connect.protocol_version = cpu_to_le32(proto);
679 con->out_connect.flags = 0; 727 con->out_connect.flags = 0;
680 728
681 if (!after_banner) { 729 if (include_banner)
682 con->out_kvec_left = 0; 730 prepare_write_banner(msgr, con);
683 con->out_kvec_bytes = 0; 731 else
684 } 732 ceph_con_out_kvec_reset(con);
685 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; 733 ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
686 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); 734
687 con->out_kvec_left++;
688 con->out_kvec_bytes += sizeof(con->out_connect);
689 con->out_kvec_cur = con->out_kvec;
690 con->out_more = 0; 735 con->out_more = 0;
691 set_bit(WRITE_PENDING, &con->state); 736 set_bit(WRITE_PENDING, &con->state);
692 737
693 return prepare_connect_authorizer(con); 738 return prepare_connect_authorizer(con);
694} 739}
695 740
696
697/* 741/*
698 * write as much of pending kvecs to the socket as we can. 742 * write as much of pending kvecs to the socket as we can.
699 * 1 -> done 743 * 1 -> done
@@ -714,17 +758,18 @@ static int write_partial_kvec(struct ceph_connection *con)
714 con->out_kvec_bytes -= ret; 758 con->out_kvec_bytes -= ret;
715 if (con->out_kvec_bytes == 0) 759 if (con->out_kvec_bytes == 0)
716 break; /* done */ 760 break; /* done */
717 while (ret > 0) { 761
718 if (ret >= con->out_kvec_cur->iov_len) { 762 /* account for full iov entries consumed */
719 ret -= con->out_kvec_cur->iov_len; 763 while (ret >= con->out_kvec_cur->iov_len) {
720 con->out_kvec_cur++; 764 BUG_ON(!con->out_kvec_left);
721 con->out_kvec_left--; 765 ret -= con->out_kvec_cur->iov_len;
722 } else { 766 con->out_kvec_cur++;
723 con->out_kvec_cur->iov_len -= ret; 767 con->out_kvec_left--;
724 con->out_kvec_cur->iov_base += ret; 768 }
725 ret = 0; 769 /* and for a partially-consumed entry */
726 break; 770 if (ret) {
727 } 771 con->out_kvec_cur->iov_len -= ret;
772 con->out_kvec_cur->iov_base += ret;
728 } 773 }
729 } 774 }
730 con->out_kvec_left = 0; 775 con->out_kvec_left = 0;
@@ -773,7 +818,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
773 struct ceph_msg *msg = con->out_msg; 818 struct ceph_msg *msg = con->out_msg;
774 unsigned data_len = le32_to_cpu(msg->hdr.data_len); 819 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
775 size_t len; 820 size_t len;
776 int crc = con->msgr->nocrc; 821 bool do_datacrc = !con->msgr->nocrc;
777 int ret; 822 int ret;
778 int total_max_write; 823 int total_max_write;
779 int in_trail = 0; 824 int in_trail = 0;
@@ -790,9 +835,8 @@ static int write_partial_msg_pages(struct ceph_connection *con)
790 835
791 while (data_len > con->out_msg_pos.data_pos) { 836 while (data_len > con->out_msg_pos.data_pos) {
792 struct page *page = NULL; 837 struct page *page = NULL;
793 void *kaddr = NULL;
794 int max_write = PAGE_SIZE; 838 int max_write = PAGE_SIZE;
795 int page_shift = 0; 839 int bio_offset = 0;
796 840
797 total_max_write = data_len - trail_len - 841 total_max_write = data_len - trail_len -
798 con->out_msg_pos.data_pos; 842 con->out_msg_pos.data_pos;
@@ -811,58 +855,47 @@ static int write_partial_msg_pages(struct ceph_connection *con)
811 855
812 page = list_first_entry(&msg->trail->head, 856 page = list_first_entry(&msg->trail->head,
813 struct page, lru); 857 struct page, lru);
814 if (crc)
815 kaddr = kmap(page);
816 max_write = PAGE_SIZE; 858 max_write = PAGE_SIZE;
817 } else if (msg->pages) { 859 } else if (msg->pages) {
818 page = msg->pages[con->out_msg_pos.page]; 860 page = msg->pages[con->out_msg_pos.page];
819 if (crc)
820 kaddr = kmap(page);
821 } else if (msg->pagelist) { 861 } else if (msg->pagelist) {
822 page = list_first_entry(&msg->pagelist->head, 862 page = list_first_entry(&msg->pagelist->head,
823 struct page, lru); 863 struct page, lru);
824 if (crc)
825 kaddr = kmap(page);
826#ifdef CONFIG_BLOCK 864#ifdef CONFIG_BLOCK
827 } else if (msg->bio) { 865 } else if (msg->bio) {
828 struct bio_vec *bv; 866 struct bio_vec *bv;
829 867
830 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); 868 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
831 page = bv->bv_page; 869 page = bv->bv_page;
832 page_shift = bv->bv_offset; 870 bio_offset = bv->bv_offset;
833 if (crc)
834 kaddr = kmap(page) + page_shift;
835 max_write = bv->bv_len; 871 max_write = bv->bv_len;
836#endif 872#endif
837 } else { 873 } else {
838 page = con->msgr->zero_page; 874 page = zero_page;
839 if (crc)
840 kaddr = page_address(con->msgr->zero_page);
841 } 875 }
842 len = min_t(int, max_write - con->out_msg_pos.page_pos, 876 len = min_t(int, max_write - con->out_msg_pos.page_pos,
843 total_max_write); 877 total_max_write);
844 878
845 if (crc && !con->out_msg_pos.did_page_crc) { 879 if (do_datacrc && !con->out_msg_pos.did_page_crc) {
846 void *base = kaddr + con->out_msg_pos.page_pos; 880 void *base;
881 u32 crc;
847 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); 882 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
883 char *kaddr;
848 884
885 kaddr = kmap(page);
849 BUG_ON(kaddr == NULL); 886 BUG_ON(kaddr == NULL);
850 con->out_msg->footer.data_crc = 887 base = kaddr + con->out_msg_pos.page_pos + bio_offset;
851 cpu_to_le32(crc32c(tmpcrc, base, len)); 888 crc = crc32c(tmpcrc, base, len);
852 con->out_msg_pos.did_page_crc = 1; 889 con->out_msg->footer.data_crc = cpu_to_le32(crc);
890 con->out_msg_pos.did_page_crc = true;
853 } 891 }
854 ret = kernel_sendpage(con->sock, page, 892 ret = ceph_tcp_sendpage(con->sock, page,
855 con->out_msg_pos.page_pos + page_shift, 893 con->out_msg_pos.page_pos + bio_offset,
856 len, 894 len, 1);
857 MSG_DONTWAIT | MSG_NOSIGNAL | 895
858 MSG_MORE); 896 if (do_datacrc)
859
860 if (crc &&
861 (msg->pages || msg->pagelist || msg->bio || in_trail))
862 kunmap(page); 897 kunmap(page);
863 898
864 if (ret == -EAGAIN)
865 ret = 0;
866 if (ret <= 0) 899 if (ret <= 0)
867 goto out; 900 goto out;
868 901
@@ -871,7 +904,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
871 if (ret == len) { 904 if (ret == len) {
872 con->out_msg_pos.page_pos = 0; 905 con->out_msg_pos.page_pos = 0;
873 con->out_msg_pos.page++; 906 con->out_msg_pos.page++;
874 con->out_msg_pos.did_page_crc = 0; 907 con->out_msg_pos.did_page_crc = false;
875 if (in_trail) 908 if (in_trail)
876 list_move_tail(&page->lru, 909 list_move_tail(&page->lru,
877 &msg->trail->head); 910 &msg->trail->head);
@@ -888,12 +921,10 @@ static int write_partial_msg_pages(struct ceph_connection *con)
888 dout("write_partial_msg_pages %p msg %p done\n", con, msg); 921 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
889 922
890 /* prepare and queue up footer, too */ 923 /* prepare and queue up footer, too */
891 if (!crc) 924 if (!do_datacrc)
892 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 925 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
893 con->out_kvec_bytes = 0; 926 ceph_con_out_kvec_reset(con);
894 con->out_kvec_left = 0; 927 prepare_write_message_footer(con);
895 con->out_kvec_cur = con->out_kvec;
896 prepare_write_message_footer(con, 0);
897 ret = 1; 928 ret = 1;
898out: 929out:
899 return ret; 930 return ret;
@@ -907,12 +938,9 @@ static int write_partial_skip(struct ceph_connection *con)
907 int ret; 938 int ret;
908 939
909 while (con->out_skip > 0) { 940 while (con->out_skip > 0) {
910 struct kvec iov = { 941 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
911 .iov_base = page_address(con->msgr->zero_page),
912 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
913 };
914 942
915 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); 943 ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1);
916 if (ret <= 0) 944 if (ret <= 0)
917 goto out; 945 goto out;
918 con->out_skip -= ret; 946 con->out_skip -= ret;
@@ -1085,8 +1113,8 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)
1085static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, 1113static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
1086 char delim, const char **ipend) 1114 char delim, const char **ipend)
1087{ 1115{
1088 struct sockaddr_in *in4 = (void *)ss; 1116 struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
1089 struct sockaddr_in6 *in6 = (void *)ss; 1117 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
1090 1118
1091 memset(ss, 0, sizeof(*ss)); 1119 memset(ss, 0, sizeof(*ss));
1092 1120
@@ -1512,10 +1540,9 @@ static int read_partial_message_section(struct ceph_connection *con,
1512 if (ret <= 0) 1540 if (ret <= 0)
1513 return ret; 1541 return ret;
1514 section->iov_len += ret; 1542 section->iov_len += ret;
1515 if (section->iov_len == sec_len)
1516 *crc = crc32c(0, section->iov_base,
1517 section->iov_len);
1518 } 1543 }
1544 if (section->iov_len == sec_len)
1545 *crc = crc32c(0, section->iov_base, section->iov_len);
1519 1546
1520 return 1; 1547 return 1;
1521} 1548}
@@ -1527,7 +1554,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1527 1554
1528static int read_partial_message_pages(struct ceph_connection *con, 1555static int read_partial_message_pages(struct ceph_connection *con,
1529 struct page **pages, 1556 struct page **pages,
1530 unsigned data_len, int datacrc) 1557 unsigned data_len, bool do_datacrc)
1531{ 1558{
1532 void *p; 1559 void *p;
1533 int ret; 1560 int ret;
@@ -1540,7 +1567,7 @@ static int read_partial_message_pages(struct ceph_connection *con,
1540 p = kmap(pages[con->in_msg_pos.page]); 1567 p = kmap(pages[con->in_msg_pos.page]);
1541 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 1568 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1542 left); 1569 left);
1543 if (ret > 0 && datacrc) 1570 if (ret > 0 && do_datacrc)
1544 con->in_data_crc = 1571 con->in_data_crc =
1545 crc32c(con->in_data_crc, 1572 crc32c(con->in_data_crc,
1546 p + con->in_msg_pos.page_pos, ret); 1573 p + con->in_msg_pos.page_pos, ret);
@@ -1560,7 +1587,7 @@ static int read_partial_message_pages(struct ceph_connection *con,
1560#ifdef CONFIG_BLOCK 1587#ifdef CONFIG_BLOCK
1561static int read_partial_message_bio(struct ceph_connection *con, 1588static int read_partial_message_bio(struct ceph_connection *con,
1562 struct bio **bio_iter, int *bio_seg, 1589 struct bio **bio_iter, int *bio_seg,
1563 unsigned data_len, int datacrc) 1590 unsigned data_len, bool do_datacrc)
1564{ 1591{
1565 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); 1592 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1566 void *p; 1593 void *p;
@@ -1576,7 +1603,7 @@ static int read_partial_message_bio(struct ceph_connection *con,
1576 1603
1577 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 1604 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1578 left); 1605 left);
1579 if (ret > 0 && datacrc) 1606 if (ret > 0 && do_datacrc)
1580 con->in_data_crc = 1607 con->in_data_crc =
1581 crc32c(con->in_data_crc, 1608 crc32c(con->in_data_crc,
1582 p + con->in_msg_pos.page_pos, ret); 1609 p + con->in_msg_pos.page_pos, ret);
@@ -1603,9 +1630,10 @@ static int read_partial_message(struct ceph_connection *con)
1603 int ret; 1630 int ret;
1604 int to, left; 1631 int to, left;
1605 unsigned front_len, middle_len, data_len; 1632 unsigned front_len, middle_len, data_len;
1606 int datacrc = con->msgr->nocrc; 1633 bool do_datacrc = !con->msgr->nocrc;
1607 int skip; 1634 int skip;
1608 u64 seq; 1635 u64 seq;
1636 u32 crc;
1609 1637
1610 dout("read_partial_message con %p msg %p\n", con, m); 1638 dout("read_partial_message con %p msg %p\n", con, m);
1611 1639
@@ -1618,17 +1646,16 @@ static int read_partial_message(struct ceph_connection *con)
1618 if (ret <= 0) 1646 if (ret <= 0)
1619 return ret; 1647 return ret;
1620 con->in_base_pos += ret; 1648 con->in_base_pos += ret;
1621 if (con->in_base_pos == sizeof(con->in_hdr)) {
1622 u32 crc = crc32c(0, (void *)&con->in_hdr,
1623 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1624 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1625 pr_err("read_partial_message bad hdr "
1626 " crc %u != expected %u\n",
1627 crc, con->in_hdr.crc);
1628 return -EBADMSG;
1629 }
1630 }
1631 } 1649 }
1650
1651 crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
1652 if (cpu_to_le32(crc) != con->in_hdr.crc) {
1653 pr_err("read_partial_message bad hdr "
1654 " crc %u != expected %u\n",
1655 crc, con->in_hdr.crc);
1656 return -EBADMSG;
1657 }
1658
1632 front_len = le32_to_cpu(con->in_hdr.front_len); 1659 front_len = le32_to_cpu(con->in_hdr.front_len);
1633 if (front_len > CEPH_MSG_MAX_FRONT_LEN) 1660 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1634 return -EIO; 1661 return -EIO;
@@ -1714,7 +1741,7 @@ static int read_partial_message(struct ceph_connection *con)
1714 while (con->in_msg_pos.data_pos < data_len) { 1741 while (con->in_msg_pos.data_pos < data_len) {
1715 if (m->pages) { 1742 if (m->pages) {
1716 ret = read_partial_message_pages(con, m->pages, 1743 ret = read_partial_message_pages(con, m->pages,
1717 data_len, datacrc); 1744 data_len, do_datacrc);
1718 if (ret <= 0) 1745 if (ret <= 0)
1719 return ret; 1746 return ret;
1720#ifdef CONFIG_BLOCK 1747#ifdef CONFIG_BLOCK
@@ -1722,7 +1749,7 @@ static int read_partial_message(struct ceph_connection *con)
1722 1749
1723 ret = read_partial_message_bio(con, 1750 ret = read_partial_message_bio(con,
1724 &m->bio_iter, &m->bio_seg, 1751 &m->bio_iter, &m->bio_seg,
1725 data_len, datacrc); 1752 data_len, do_datacrc);
1726 if (ret <= 0) 1753 if (ret <= 0)
1727 return ret; 1754 return ret;
1728#endif 1755#endif
@@ -1757,7 +1784,7 @@ static int read_partial_message(struct ceph_connection *con)
1757 m, con->in_middle_crc, m->footer.middle_crc); 1784 m, con->in_middle_crc, m->footer.middle_crc);
1758 return -EBADMSG; 1785 return -EBADMSG;
1759 } 1786 }
1760 if (datacrc && 1787 if (do_datacrc &&
1761 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && 1788 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1762 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { 1789 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1763 pr_err("read_partial_message %p data crc %u != exp. %u\n", m, 1790 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
@@ -1819,7 +1846,6 @@ more:
1819 1846
1820 /* open the socket first? */ 1847 /* open the socket first? */
1821 if (con->sock == NULL) { 1848 if (con->sock == NULL) {
1822 prepare_write_banner(msgr, con);
1823 prepare_write_connect(msgr, con, 1); 1849 prepare_write_connect(msgr, con, 1);
1824 prepare_read_banner(con); 1850 prepare_read_banner(con);
1825 set_bit(CONNECTING, &con->state); 1851 set_bit(CONNECTING, &con->state);
@@ -1829,11 +1855,9 @@ more:
1829 con->in_tag = CEPH_MSGR_TAG_READY; 1855 con->in_tag = CEPH_MSGR_TAG_READY;
1830 dout("try_write initiating connect on %p new state %lu\n", 1856 dout("try_write initiating connect on %p new state %lu\n",
1831 con, con->state); 1857 con, con->state);
1832 con->sock = ceph_tcp_connect(con); 1858 ret = ceph_tcp_connect(con);
1833 if (IS_ERR(con->sock)) { 1859 if (ret < 0) {
1834 con->sock = NULL;
1835 con->error_msg = "connect error"; 1860 con->error_msg = "connect error";
1836 ret = -1;
1837 goto out; 1861 goto out;
1838 } 1862 }
1839 } 1863 }
@@ -1953,8 +1977,9 @@ more:
1953 * 1977 *
1954 * FIXME: there must be a better way to do this! 1978 * FIXME: there must be a better way to do this!
1955 */ 1979 */
1956 static char buf[1024]; 1980 static char buf[SKIP_BUF_SIZE];
1957 int skip = min(1024, -con->in_base_pos); 1981 int skip = min((int) sizeof (buf), -con->in_base_pos);
1982
1958 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); 1983 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1959 ret = ceph_tcp_recvmsg(con->sock, buf, skip); 1984 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1960 if (ret <= 0) 1985 if (ret <= 0)
@@ -2216,15 +2241,6 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2216 2241
2217 spin_lock_init(&msgr->global_seq_lock); 2242 spin_lock_init(&msgr->global_seq_lock);
2218 2243
2219 /* the zero page is needed if a request is "canceled" while the message
2220 * is being written over the socket */
2221 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
2222 if (!msgr->zero_page) {
2223 kfree(msgr);
2224 return ERR_PTR(-ENOMEM);
2225 }
2226 kmap(msgr->zero_page);
2227
2228 if (myaddr) 2244 if (myaddr)
2229 msgr->inst.addr = *myaddr; 2245 msgr->inst.addr = *myaddr;
2230 2246
@@ -2241,8 +2257,6 @@ EXPORT_SYMBOL(ceph_messenger_create);
2241void ceph_messenger_destroy(struct ceph_messenger *msgr) 2257void ceph_messenger_destroy(struct ceph_messenger *msgr)
2242{ 2258{
2243 dout("destroy %p\n", msgr); 2259 dout("destroy %p\n", msgr);
2244 kunmap(msgr->zero_page);
2245 __free_page(msgr->zero_page);
2246 kfree(msgr); 2260 kfree(msgr);
2247 dout("destroyed messenger %p\n", msgr); 2261 dout("destroyed messenger %p\n", msgr);
2248} 2262}
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index fd863fe76934..29ad46ec9dcf 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -283,7 +283,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
283 ceph_decode_32_safe(p, end, yes, bad); 283 ceph_decode_32_safe(p, end, yes, bad);
284#if BITS_PER_LONG == 32 284#if BITS_PER_LONG == 32
285 err = -EINVAL; 285 err = -EINVAL;
286 if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) 286 if (yes > (ULONG_MAX - sizeof(*r))
287 / sizeof(struct crush_rule_step))
287 goto bad; 288 goto bad;
288#endif 289#endif
289 r = c->rules[i] = kmalloc(sizeof(*r) + 290 r = c->rules[i] = kmalloc(sizeof(*r) +
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6eb656acdfe5..a690cae91cdd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -321,12 +321,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
321EXPORT_SYMBOL(__netdev_alloc_skb); 321EXPORT_SYMBOL(__netdev_alloc_skb);
322 322
323void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 323void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
324 int size) 324 int size, unsigned int truesize)
325{ 325{
326 skb_fill_page_desc(skb, i, page, off, size); 326 skb_fill_page_desc(skb, i, page, off, size);
327 skb->len += size; 327 skb->len += size;
328 skb->data_len += size; 328 skb->data_len += size;
329 skb->truesize += size; 329 skb->truesize += truesize;
330} 330}
331EXPORT_SYMBOL(skb_add_rx_frag); 331EXPORT_SYMBOL(skb_add_rx_frag);
332 332
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 24c456e8aa1d..496b62712fe8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2474,8 +2474,12 @@ static int rt6_fill_node(struct net *net,
2474 2474
2475 rcu_read_lock(); 2475 rcu_read_lock();
2476 n = dst_get_neighbour_noref(&rt->dst); 2476 n = dst_get_neighbour_noref(&rt->dst);
2477 if (n) 2477 if (n) {
2478 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key); 2478 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2479 rcu_read_unlock();
2480 goto nla_put_failure;
2481 }
2482 }
2479 rcu_read_unlock(); 2483 rcu_read_unlock();
2480 2484
2481 if (rt->dst.dev) 2485 if (rt->dst.dev)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7b48035826ee..cbdb754dbb10 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -768,8 +768,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
768 struct nf_conntrack_l3proto *l3proto, 768 struct nf_conntrack_l3proto *l3proto,
769 struct nf_conntrack_l4proto *l4proto, 769 struct nf_conntrack_l4proto *l4proto,
770 struct sk_buff *skb, 770 struct sk_buff *skb,
771 unsigned int dataoff, u32 hash, 771 unsigned int dataoff, u32 hash)
772 unsigned int *timeouts)
773{ 772{
774 struct nf_conn *ct; 773 struct nf_conn *ct;
775 struct nf_conn_help *help; 774 struct nf_conn_help *help;
@@ -777,6 +776,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
777 struct nf_conntrack_ecache *ecache; 776 struct nf_conntrack_ecache *ecache;
778 struct nf_conntrack_expect *exp; 777 struct nf_conntrack_expect *exp;
779 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; 778 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
779 struct nf_conn_timeout *timeout_ext;
780 unsigned int *timeouts;
780 781
781 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { 782 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
782 pr_debug("Can't invert tuple.\n"); 783 pr_debug("Can't invert tuple.\n");
@@ -788,12 +789,21 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
788 if (IS_ERR(ct)) 789 if (IS_ERR(ct))
789 return (struct nf_conntrack_tuple_hash *)ct; 790 return (struct nf_conntrack_tuple_hash *)ct;
790 791
792 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
793 if (timeout_ext)
794 timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
795 else
796 timeouts = l4proto->get_timeouts(net);
797
791 if (!l4proto->new(ct, skb, dataoff, timeouts)) { 798 if (!l4proto->new(ct, skb, dataoff, timeouts)) {
792 nf_conntrack_free(ct); 799 nf_conntrack_free(ct);
793 pr_debug("init conntrack: can't track with proto module\n"); 800 pr_debug("init conntrack: can't track with proto module\n");
794 return NULL; 801 return NULL;
795 } 802 }
796 803
804 if (timeout_ext)
805 nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);
806
797 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 807 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
798 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 808 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
799 809
@@ -854,8 +864,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
854 struct nf_conntrack_l3proto *l3proto, 864 struct nf_conntrack_l3proto *l3proto,
855 struct nf_conntrack_l4proto *l4proto, 865 struct nf_conntrack_l4proto *l4proto,
856 int *set_reply, 866 int *set_reply,
857 enum ip_conntrack_info *ctinfo, 867 enum ip_conntrack_info *ctinfo)
858 unsigned int *timeouts)
859{ 868{
860 struct nf_conntrack_tuple tuple; 869 struct nf_conntrack_tuple tuple;
861 struct nf_conntrack_tuple_hash *h; 870 struct nf_conntrack_tuple_hash *h;
@@ -875,7 +884,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
875 h = __nf_conntrack_find_get(net, zone, &tuple, hash); 884 h = __nf_conntrack_find_get(net, zone, &tuple, hash);
876 if (!h) { 885 if (!h) {
877 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, 886 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
878 skb, dataoff, hash, timeouts); 887 skb, dataoff, hash);
879 if (!h) 888 if (!h)
880 return NULL; 889 return NULL;
881 if (IS_ERR(h)) 890 if (IS_ERR(h))
@@ -964,19 +973,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
964 goto out; 973 goto out;
965 } 974 }
966 975
967 /* Decide what timeout policy we want to apply to this flow. */
968 if (tmpl) {
969 timeout_ext = nf_ct_timeout_find(tmpl);
970 if (timeout_ext)
971 timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
972 else
973 timeouts = l4proto->get_timeouts(net);
974 } else
975 timeouts = l4proto->get_timeouts(net);
976
977 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, 976 ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
978 l3proto, l4proto, &set_reply, &ctinfo, 977 l3proto, l4proto, &set_reply, &ctinfo);
979 timeouts);
980 if (!ct) { 978 if (!ct) {
981 /* Not valid part of a connection */ 979 /* Not valid part of a connection */
982 NF_CT_STAT_INC_ATOMIC(net, invalid); 980 NF_CT_STAT_INC_ATOMIC(net, invalid);
@@ -993,6 +991,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
993 991
994 NF_CT_ASSERT(skb->nfct); 992 NF_CT_ASSERT(skb->nfct);
995 993
994 /* Decide what timeout policy we want to apply to this flow. */
995 timeout_ext = nf_ct_timeout_find(ct);
996 if (timeout_ext)
997 timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
998 else
999 timeouts = l4proto->get_timeouts(net);
1000
996 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts); 1001 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
997 if (ret <= 0) { 1002 if (ret <= 0) {
998 /* Invalid: inverse of the return code tells 1003 /* Invalid: inverse of the return code tells
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5701c8dd783c..be3da2c8cdc5 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -127,6 +127,27 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
127} 127}
128EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); 128EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
129 129
130struct nf_conntrack_l4proto *
131nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
132{
133 struct nf_conntrack_l4proto *p;
134
135 rcu_read_lock();
136 p = __nf_ct_l4proto_find(l3num, l4num);
137 if (!try_module_get(p->me))
138 p = &nf_conntrack_l4proto_generic;
139 rcu_read_unlock();
140
141 return p;
142}
143EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get);
144
145void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p)
146{
147 module_put(p->me);
148}
149EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
150
130static int kill_l3proto(struct nf_conn *i, void *data) 151static int kill_l3proto(struct nf_conn *i, void *data)
131{ 152{
132 return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; 153 return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index fec29a43de4d..2b9e79f5ef05 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -98,11 +98,13 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
98 break; 98 break;
99 } 99 }
100 100
101 l4proto = __nf_ct_l4proto_find(l3num, l4num); 101 l4proto = nf_ct_l4proto_find_get(l3num, l4num);
102 102
103 /* This protocol is not supportted, skip. */ 103 /* This protocol is not supportted, skip. */
104 if (l4proto->l4proto != l4num) 104 if (l4proto->l4proto != l4num) {
105 return -EOPNOTSUPP; 105 ret = -EOPNOTSUPP;
106 goto err_proto_put;
107 }
106 108
107 if (matching) { 109 if (matching) {
108 if (nlh->nlmsg_flags & NLM_F_REPLACE) { 110 if (nlh->nlmsg_flags & NLM_F_REPLACE) {
@@ -110,20 +112,25 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
110 * different kind, sorry. 112 * different kind, sorry.
111 */ 113 */
112 if (matching->l3num != l3num || 114 if (matching->l3num != l3num ||
113 matching->l4num != l4num) 115 matching->l4proto->l4proto != l4num) {
114 return -EINVAL; 116 ret = -EINVAL;
117 goto err_proto_put;
118 }
115 119
116 ret = ctnl_timeout_parse_policy(matching, l4proto, 120 ret = ctnl_timeout_parse_policy(matching, l4proto,
117 cda[CTA_TIMEOUT_DATA]); 121 cda[CTA_TIMEOUT_DATA]);
118 return ret; 122 return ret;
119 } 123 }
120 return -EBUSY; 124 ret = -EBUSY;
125 goto err_proto_put;
121 } 126 }
122 127
123 timeout = kzalloc(sizeof(struct ctnl_timeout) + 128 timeout = kzalloc(sizeof(struct ctnl_timeout) +
124 l4proto->ctnl_timeout.obj_size, GFP_KERNEL); 129 l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
125 if (timeout == NULL) 130 if (timeout == NULL) {
126 return -ENOMEM; 131 ret = -ENOMEM;
132 goto err_proto_put;
133 }
127 134
128 ret = ctnl_timeout_parse_policy(timeout, l4proto, 135 ret = ctnl_timeout_parse_policy(timeout, l4proto,
129 cda[CTA_TIMEOUT_DATA]); 136 cda[CTA_TIMEOUT_DATA]);
@@ -132,13 +139,15 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
132 139
133 strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); 140 strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME]));
134 timeout->l3num = l3num; 141 timeout->l3num = l3num;
135 timeout->l4num = l4num; 142 timeout->l4proto = l4proto;
136 atomic_set(&timeout->refcnt, 1); 143 atomic_set(&timeout->refcnt, 1);
137 list_add_tail_rcu(&timeout->head, &cttimeout_list); 144 list_add_tail_rcu(&timeout->head, &cttimeout_list);
138 145
139 return 0; 146 return 0;
140err: 147err:
141 kfree(timeout); 148 kfree(timeout);
149err_proto_put:
150 nf_ct_l4proto_put(l4proto);
142 return ret; 151 return ret;
143} 152}
144 153
@@ -149,7 +158,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type,
149 struct nlmsghdr *nlh; 158 struct nlmsghdr *nlh;
150 struct nfgenmsg *nfmsg; 159 struct nfgenmsg *nfmsg;
151 unsigned int flags = pid ? NLM_F_MULTI : 0; 160 unsigned int flags = pid ? NLM_F_MULTI : 0;
152 struct nf_conntrack_l4proto *l4proto; 161 struct nf_conntrack_l4proto *l4proto = timeout->l4proto;
153 162
154 event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; 163 event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
155 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); 164 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags);
@@ -163,20 +172,10 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type,
163 172
164 NLA_PUT_STRING(skb, CTA_TIMEOUT_NAME, timeout->name); 173 NLA_PUT_STRING(skb, CTA_TIMEOUT_NAME, timeout->name);
165 NLA_PUT_BE16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)); 174 NLA_PUT_BE16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num));
166 NLA_PUT_U8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4num); 175 NLA_PUT_U8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto);
167 NLA_PUT_BE32(skb, CTA_TIMEOUT_USE, 176 NLA_PUT_BE32(skb, CTA_TIMEOUT_USE,
168 htonl(atomic_read(&timeout->refcnt))); 177 htonl(atomic_read(&timeout->refcnt)));
169 178
170 l4proto = __nf_ct_l4proto_find(timeout->l3num, timeout->l4num);
171
172 /* If the timeout object does not match the layer 4 protocol tracker,
173 * then skip dumping the data part since we don't know how to
174 * interpret it. This may happen for UPDlite, SCTP and DCCP since
175 * you can unload the module.
176 */
177 if (timeout->l4num != l4proto->l4proto)
178 goto out;
179
180 if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { 179 if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
181 struct nlattr *nest_parms; 180 struct nlattr *nest_parms;
182 int ret; 181 int ret;
@@ -192,7 +191,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type,
192 191
193 nla_nest_end(skb, nest_parms); 192 nla_nest_end(skb, nest_parms);
194 } 193 }
195out: 194
196 nlmsg_end(skb, nlh); 195 nlmsg_end(skb, nlh);
197 return skb->len; 196 return skb->len;
198 197
@@ -293,6 +292,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
293 if (atomic_dec_and_test(&timeout->refcnt)) { 292 if (atomic_dec_and_test(&timeout->refcnt)) {
294 /* We are protected by nfnl mutex. */ 293 /* We are protected by nfnl mutex. */
295 list_del_rcu(&timeout->head); 294 list_del_rcu(&timeout->head);
295 nf_ct_l4proto_put(timeout->l4proto);
296 kfree_rcu(timeout, rcu_head); 296 kfree_rcu(timeout, rcu_head);
297 } else { 297 } else {
298 /* still in use, restore reference counter. */ 298 /* still in use, restore reference counter. */
@@ -417,6 +417,7 @@ static void __exit cttimeout_exit(void)
417 /* We are sure that our objects have no clients at this point, 417 /* We are sure that our objects have no clients at this point,
418 * it's safe to release them all without checking refcnt. 418 * it's safe to release them all without checking refcnt.
419 */ 419 */
420 nf_ct_l4proto_put(cur->l4proto);
420 kfree_rcu(cur, rcu_head); 421 kfree_rcu(cur, rcu_head);
421 } 422 }
422#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 423#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index b873445df444..0c8e43810ce3 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -14,8 +14,10 @@
14#include <linux/netfilter/x_tables.h> 14#include <linux/netfilter/x_tables.h>
15#include <linux/netfilter/xt_CT.h> 15#include <linux/netfilter/xt_CT.h>
16#include <net/netfilter/nf_conntrack.h> 16#include <net/netfilter/nf_conntrack.h>
17#include <net/netfilter/nf_conntrack_l4proto.h>
17#include <net/netfilter/nf_conntrack_helper.h> 18#include <net/netfilter/nf_conntrack_helper.h>
18#include <net/netfilter/nf_conntrack_ecache.h> 19#include <net/netfilter/nf_conntrack_ecache.h>
20#include <net/netfilter/nf_conntrack_l4proto.h>
19#include <net/netfilter/nf_conntrack_timeout.h> 21#include <net/netfilter/nf_conntrack_timeout.h>
20#include <net/netfilter/nf_conntrack_zones.h> 22#include <net/netfilter/nf_conntrack_zones.h>
21 23
@@ -217,50 +219,59 @@ static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par)
217 struct ctnl_timeout *timeout; 219 struct ctnl_timeout *timeout;
218 struct nf_conn_timeout *timeout_ext; 220 struct nf_conn_timeout *timeout_ext;
219 221
222 rcu_read_lock();
220 timeout_find_get = 223 timeout_find_get =
221 rcu_dereference(nf_ct_timeout_find_get_hook); 224 rcu_dereference(nf_ct_timeout_find_get_hook);
222 225
223 if (timeout_find_get) { 226 if (timeout_find_get) {
224 const struct ipt_entry *e = par->entryinfo; 227 const struct ipt_entry *e = par->entryinfo;
228 struct nf_conntrack_l4proto *l4proto;
225 229
226 if (e->ip.invflags & IPT_INV_PROTO) { 230 if (e->ip.invflags & IPT_INV_PROTO) {
227 ret = -EINVAL; 231 ret = -EINVAL;
228 pr_info("You cannot use inversion on " 232 pr_info("You cannot use inversion on "
229 "L4 protocol\n"); 233 "L4 protocol\n");
230 goto err3; 234 goto err4;
231 } 235 }
232 timeout = timeout_find_get(info->timeout); 236 timeout = timeout_find_get(info->timeout);
233 if (timeout == NULL) { 237 if (timeout == NULL) {
234 ret = -ENOENT; 238 ret = -ENOENT;
235 pr_info("No such timeout policy \"%s\"\n", 239 pr_info("No such timeout policy \"%s\"\n",
236 info->timeout); 240 info->timeout);
237 goto err3; 241 goto err4;
238 } 242 }
239 if (timeout->l3num != par->family) { 243 if (timeout->l3num != par->family) {
240 ret = -EINVAL; 244 ret = -EINVAL;
241 pr_info("Timeout policy `%s' can only be " 245 pr_info("Timeout policy `%s' can only be "
242 "used by L3 protocol number %d\n", 246 "used by L3 protocol number %d\n",
243 info->timeout, timeout->l3num); 247 info->timeout, timeout->l3num);
244 goto err3; 248 goto err4;
245 } 249 }
246 if (timeout->l4num != e->ip.proto) { 250 /* Make sure the timeout policy matches any existing
251 * protocol tracker, otherwise default to generic.
252 */
253 l4proto = __nf_ct_l4proto_find(par->family,
254 e->ip.proto);
255 if (timeout->l4proto->l4proto != l4proto->l4proto) {
247 ret = -EINVAL; 256 ret = -EINVAL;
248 pr_info("Timeout policy `%s' can only be " 257 pr_info("Timeout policy `%s' can only be "
249 "used by L4 protocol number %d\n", 258 "used by L4 protocol number %d\n",
250 info->timeout, timeout->l4num); 259 info->timeout,
251 goto err3; 260 timeout->l4proto->l4proto);
261 goto err4;
252 } 262 }
253 timeout_ext = nf_ct_timeout_ext_add(ct, timeout, 263 timeout_ext = nf_ct_timeout_ext_add(ct, timeout,
254 GFP_KERNEL); 264 GFP_KERNEL);
255 if (timeout_ext == NULL) { 265 if (timeout_ext == NULL) {
256 ret = -ENOMEM; 266 ret = -ENOMEM;
257 goto err3; 267 goto err4;
258 } 268 }
259 } else { 269 } else {
260 ret = -ENOENT; 270 ret = -ENOENT;
261 pr_info("Timeout policy base is empty\n"); 271 pr_info("Timeout policy base is empty\n");
262 goto err3; 272 goto err4;
263 } 273 }
274 rcu_read_unlock();
264 } 275 }
265#endif 276#endif
266 277
@@ -270,6 +281,8 @@ out:
270 info->ct = ct; 281 info->ct = ct;
271 return 0; 282 return 0;
272 283
284err4:
285 rcu_read_unlock();
273err3: 286err3:
274 nf_conntrack_free(ct); 287 nf_conntrack_free(ct);
275err2: 288err2:
@@ -311,6 +324,7 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
311 nf_ct_l3proto_module_put(par->family); 324 nf_ct_l3proto_module_put(par->family);
312 325
313#ifdef CONFIG_NF_CONNTRACK_TIMEOUT 326#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
327 rcu_read_lock();
314 timeout_put = rcu_dereference(nf_ct_timeout_put_hook); 328 timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
315 329
316 if (timeout_put) { 330 if (timeout_put) {
@@ -318,6 +332,7 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
318 if (timeout_ext) 332 if (timeout_ext)
319 timeout_put(timeout_ext->timeout); 333 timeout_put(timeout_ext->timeout);
320 } 334 }
335 rcu_read_unlock();
321#endif 336#endif
322 } 337 }
323 nf_ct_put(info->ct); 338 nf_ct_put(info->ct);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index f99f8dee238b..ff5f75fddb15 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -480,7 +480,7 @@ ipt_log_packet(u_int8_t pf,
480 sb_close(m); 480 sb_close(m);
481} 481}
482 482
483#if IS_ENABLED(CONFIG_IPV6) 483#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
484/* One level of recursion won't kill us */ 484/* One level of recursion won't kill us */
485static void dump_ipv6_packet(struct sbuff *m, 485static void dump_ipv6_packet(struct sbuff *m,
486 const struct nf_loginfo *info, 486 const struct nf_loginfo *info,
@@ -824,7 +824,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
824 if (par->family == NFPROTO_IPV4) 824 if (par->family == NFPROTO_IPV4)
825 ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, 825 ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in,
826 par->out, &li, loginfo->prefix); 826 par->out, &li, loginfo->prefix);
827#if IS_ENABLED(CONFIG_IPV6) 827#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
828 else if (par->family == NFPROTO_IPV6) 828 else if (par->family == NFPROTO_IPV6)
829 ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, 829 ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in,
830 par->out, &li, loginfo->prefix); 830 par->out, &li, loginfo->prefix);
@@ -864,7 +864,7 @@ static struct xt_target log_tg_regs[] __read_mostly = {
864 .checkentry = log_tg_check, 864 .checkentry = log_tg_check,
865 .me = THIS_MODULE, 865 .me = THIS_MODULE,
866 }, 866 },
867#if IS_ENABLED(CONFIG_IPV6) 867#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
868 { 868 {
869 .name = "LOG", 869 .name = "LOG",
870 .family = NFPROTO_IPV6, 870 .family = NFPROTO_IPV6,
@@ -882,7 +882,7 @@ static struct nf_logger ipt_log_logger __read_mostly = {
882 .me = THIS_MODULE, 882 .me = THIS_MODULE,
883}; 883};
884 884
885#if IS_ENABLED(CONFIG_IPV6) 885#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
886static struct nf_logger ip6t_log_logger __read_mostly = { 886static struct nf_logger ip6t_log_logger __read_mostly = {
887 .name = "ip6t_LOG", 887 .name = "ip6t_LOG",
888 .logfn = &ip6t_log_packet, 888 .logfn = &ip6t_log_packet,
@@ -899,7 +899,7 @@ static int __init log_tg_init(void)
899 return ret; 899 return ret;
900 900
901 nf_log_register(NFPROTO_IPV4, &ipt_log_logger); 901 nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
902#if IS_ENABLED(CONFIG_IPV6) 902#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
903 nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); 903 nf_log_register(NFPROTO_IPV6, &ip6t_log_logger);
904#endif 904#endif
905 return 0; 905 return 0;
@@ -908,7 +908,7 @@ static int __init log_tg_init(void)
908static void __exit log_tg_exit(void) 908static void __exit log_tg_exit(void)
909{ 909{
910 nf_log_unregister(&ipt_log_logger); 910 nf_log_unregister(&ipt_log_logger);
911#if IS_ENABLED(CONFIG_IPV6) 911#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
912 nf_log_unregister(&ip6t_log_logger); 912 nf_log_unregister(&ip6t_log_logger);
913#endif 913#endif
914 xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); 914 xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));