aboutsummaryrefslogtreecommitdiffstats
path: root/net/sunrpc
diff options
context:
space:
mode:
author Trond Myklebust <Trond.Myklebust@netapp.com> 2013-11-08 16:03:50 -0500
committer Trond Myklebust <Trond.Myklebust@netapp.com> 2013-11-08 17:19:15 -0500
commit a6b31d18b02ff9d7915c5898c9b5ca41a798cd73 (patch)
tree 14316d0663c3b55858f38629697e81286e5ebce7 /net/sunrpc
parent fab99ebe39fe7d11fbd9b5fb84f07432af9ba36f (diff)
SUNRPC: Fix a data corruption issue when retransmitting RPC calls
The following scenario can cause silent data corruption when doing NFS writes. It has mainly been observed when doing database writes using O_DIRECT.

1) The RPC client uses sendpage() to do zero-copy of the page data.
2) Due to networking issues, the reply from the server is delayed, and so the RPC client times out.
3) The client issues a second sendpage of the page data as part of an RPC call retransmission.
4) The reply to the first transmission arrives from the server _before_ the client hardware has emptied the TCP socket send buffer.
5) After processing the reply, the RPC state machine rules the call to be done, and triggers the completion callbacks.
6) The application notices the RPC call is done, and reuses the pages to store something else (e.g. a new write).
7) The client NIC drains the TCP socket send buffer. Since the page data has now changed, it reads a corrupted version of the initial RPC call, and puts it on the wire.

This patch fixes the problem in the following manner:

The ordering guarantees of TCP ensure that when the server sends a reply, then we know that the _first_ transmission has completed. Using zero-copy in that situation is therefore safe.

If a timeout occurs, we then send the retransmission using sendmsg() (i.e. no zero-copy). We then know that the socket contains a full copy of the data, and so it will retransmit a faithful reproduction even if the RPC call completes, and the application reuses the O_DIRECT buffer in the meantime.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/xprtsock.c28
1 files changed, 21 insertions, 7 deletions
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 17c88928b7db..dd9d295813cf 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
393 return kernel_sendmsg(sock, &msg, NULL, 0, 0); 393 return kernel_sendmsg(sock, &msg, NULL, 0, 0);
394} 394}
395 395
396static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more) 396static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
397{ 397{
398 ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
399 int offset, size_t size, int flags);
398 struct page **ppage; 400 struct page **ppage;
399 unsigned int remainder; 401 unsigned int remainder;
400 int err, sent = 0; 402 int err, sent = 0;
@@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
403 base += xdr->page_base; 405 base += xdr->page_base;
404 ppage = xdr->pages + (base >> PAGE_SHIFT); 406 ppage = xdr->pages + (base >> PAGE_SHIFT);
405 base &= ~PAGE_MASK; 407 base &= ~PAGE_MASK;
408 do_sendpage = sock->ops->sendpage;
409 if (!zerocopy)
410 do_sendpage = sock_no_sendpage;
406 for(;;) { 411 for(;;) {
407 unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder); 412 unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
408 int flags = XS_SENDMSG_FLAGS; 413 int flags = XS_SENDMSG_FLAGS;
@@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
410 remainder -= len; 415 remainder -= len;
411 if (remainder != 0 || more) 416 if (remainder != 0 || more)
412 flags |= MSG_MORE; 417 flags |= MSG_MORE;
413 err = sock->ops->sendpage(sock, *ppage, base, len, flags); 418 err = do_sendpage(sock, *ppage, base, len, flags);
414 if (remainder == 0 || err != len) 419 if (remainder == 0 || err != len)
415 break; 420 break;
416 sent += err; 421 sent += err;
@@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
431 * @addrlen: UDP only -- length of destination address 436 * @addrlen: UDP only -- length of destination address
432 * @xdr: buffer containing this request 437 * @xdr: buffer containing this request
433 * @base: starting position in the buffer 438 * @base: starting position in the buffer
439 * @zerocopy: true if it is safe to use sendpage()
434 * 440 *
435 */ 441 */
436static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) 442static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
437{ 443{
438 unsigned int remainder = xdr->len - base; 444 unsigned int remainder = xdr->len - base;
439 int err, sent = 0; 445 int err, sent = 0;
@@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
461 if (base < xdr->page_len) { 467 if (base < xdr->page_len) {
462 unsigned int len = xdr->page_len - base; 468 unsigned int len = xdr->page_len - base;
463 remainder -= len; 469 remainder -= len;
464 err = xs_send_pagedata(sock, xdr, base, remainder != 0); 470 err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
465 if (remainder == 0 || err != len) 471 if (remainder == 0 || err != len)
466 goto out; 472 goto out;
467 sent += err; 473 sent += err;
@@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
564 req->rq_svec->iov_base, req->rq_svec->iov_len); 570 req->rq_svec->iov_base, req->rq_svec->iov_len);
565 571
566 status = xs_sendpages(transport->sock, NULL, 0, 572 status = xs_sendpages(transport->sock, NULL, 0,
567 xdr, req->rq_bytes_sent); 573 xdr, req->rq_bytes_sent, true);
568 dprintk("RPC: %s(%u) = %d\n", 574 dprintk("RPC: %s(%u) = %d\n",
569 __func__, xdr->len - req->rq_bytes_sent, status); 575 __func__, xdr->len - req->rq_bytes_sent, status);
570 if (likely(status >= 0)) { 576 if (likely(status >= 0)) {
@@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
620 status = xs_sendpages(transport->sock, 626 status = xs_sendpages(transport->sock,
621 xs_addr(xprt), 627 xs_addr(xprt),
622 xprt->addrlen, xdr, 628 xprt->addrlen, xdr,
623 req->rq_bytes_sent); 629 req->rq_bytes_sent, true);
624 630
625 dprintk("RPC: xs_udp_send_request(%u) = %d\n", 631 dprintk("RPC: xs_udp_send_request(%u) = %d\n",
626 xdr->len - req->rq_bytes_sent, status); 632 xdr->len - req->rq_bytes_sent, status);
@@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
693 struct rpc_xprt *xprt = req->rq_xprt; 699 struct rpc_xprt *xprt = req->rq_xprt;
694 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 700 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
695 struct xdr_buf *xdr = &req->rq_snd_buf; 701 struct xdr_buf *xdr = &req->rq_snd_buf;
702 bool zerocopy = true;
696 int status; 703 int status;
697 704
698 xs_encode_stream_record_marker(&req->rq_snd_buf); 705 xs_encode_stream_record_marker(&req->rq_snd_buf);
@@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
700 xs_pktdump("packet data:", 707 xs_pktdump("packet data:",
701 req->rq_svec->iov_base, 708 req->rq_svec->iov_base,
702 req->rq_svec->iov_len); 709 req->rq_svec->iov_len);
710 /* Don't use zero copy if this is a resend. If the RPC call
711 * completes while the socket holds a reference to the pages,
712 * then we may end up resending corrupted data.
713 */
714 if (task->tk_flags & RPC_TASK_SENT)
715 zerocopy = false;
703 716
704 /* Continue transmitting the packet/record. We must be careful 717 /* Continue transmitting the packet/record. We must be careful
705 * to cope with writespace callbacks arriving _after_ we have 718 * to cope with writespace callbacks arriving _after_ we have
706 * called sendmsg(). */ 719 * called sendmsg(). */
707 while (1) { 720 while (1) {
708 status = xs_sendpages(transport->sock, 721 status = xs_sendpages(transport->sock,
709 NULL, 0, xdr, req->rq_bytes_sent); 722 NULL, 0, xdr, req->rq_bytes_sent,
723 zerocopy);
710 724
711 dprintk("RPC: xs_tcp_send_request(%u) = %d\n", 725 dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
712 xdr->len - req->rq_bytes_sent, status); 726 xdr->len - req->rq_bytes_sent, status);