summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jeff Layton <jlayton@poochiereds.net> 2016-05-17 12:28:47 -0400
committer Anna Schumaker <Anna.Schumaker@Netapp.com> 2016-05-17 15:48:12 -0400
commit 183d9e7b112aaed0d19c16ffcf0f8c3a86dc71e0 (patch)
tree e78c3b12f7a309b7a364dff4ebcab69fab66cfae
parent 83026d80a16ea6a4e4f06a994fc7831b1d8d6375 (diff)
pnfs: rework LAYOUTGET retry handling
There are several problems in the way a stateid is selected for a LAYOUTGET operation:

We pick a stateid to use in the RPC prepare op, but that makes it difficult to serialize LAYOUTGETs that use the open stateid. That serialization is done in pnfs_update_layout, which occurs well before the rpc_prepare operation.

Between those two events, the i_lock is dropped and reacquired. pnfs_update_layout can find that the list has lsegs in it and not do any serialization, but then later pnfs_choose_layoutget_stateid ends up choosing the open stateid.

This patch changes the client to select the stateid to use in the LAYOUTGET earlier, when we're searching for a usable layout segment. This way we can do it all while holding the i_lock the first time, and ensure that we serialize any LAYOUTGET call that uses a non-layout stateid.

This also means a rework of how LAYOUTGET replies are handled, as we must now get the latest stateid if we want to retransmit in response to a retryable error.

Most of those errors boil down to the fact that the layout state has changed in some fashion. Thus, what we really want to do is to re-search for a layout when it fails with a retryable error, so that we can avoid reissuing the RPC at all if possible.

While the LAYOUTGET RPC is async, the initiating thread always waits for it to complete, so it's effectively synchronous anyway. Currently, when we need to retry a LAYOUTGET because of an error, we drive that retry via the rpc state machine.

This means that once the call has been submitted, it runs until it completes. So, we must move the error handling for this RPC out of the rpc_call_done operation and into the caller.

In order to handle errors like NFS4ERR_DELAY properly, we must also pass a pointer to the sliding timeout, which is now moved to the stack in pnfs_update_layout.

The complicating errors are -NFS4ERR_RECALLCONFLICT and -NFS4ERR_LAYOUTTRYLATER, as those involve a timeout after which we give up and return NULL back to the caller. So, there is some special handling for those errors to ensure that the layers driving the retries can handle that appropriately.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
-rw-r--r-- fs/nfs/nfs4proc.c 115
-rw-r--r-- fs/nfs/nfs4trace.h 10
-rw-r--r-- fs/nfs/pnfs.c 144
-rw-r--r-- fs/nfs/pnfs.h 6
-rw-r--r-- include/linux/errno.h 1
-rw-r--r-- include/linux/nfs4.h 2
-rw-r--r-- include/linux/nfs_xdr.h 2
7 files changed, 136 insertions, 144 deletions
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2a29f5d12aeb..62d67f040906 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -427,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
427 case -NFS4ERR_DELAY: 427 case -NFS4ERR_DELAY:
428 nfs_inc_server_stats(server, NFSIOS_DELAY); 428 nfs_inc_server_stats(server, NFSIOS_DELAY);
429 case -NFS4ERR_GRACE: 429 case -NFS4ERR_GRACE:
430 case -NFS4ERR_RECALLCONFLICT:
430 exception->delay = 1; 431 exception->delay = 1;
431 return 0; 432 return 0;
432 433
@@ -7847,40 +7848,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
7847 struct nfs4_layoutget *lgp = calldata; 7848 struct nfs4_layoutget *lgp = calldata;
7848 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 7849 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
7849 struct nfs4_session *session = nfs4_get_session(server); 7850 struct nfs4_session *session = nfs4_get_session(server);
7850 int ret;
7851 7851
7852 dprintk("--> %s\n", __func__); 7852 dprintk("--> %s\n", __func__);
7853 /* Note the is a race here, where a CB_LAYOUTRECALL can come in 7853 nfs41_setup_sequence(session, &lgp->args.seq_args,
7854 * right now covering the LAYOUTGET we are about to send. 7854 &lgp->res.seq_res, task);
7855 * However, that is not so catastrophic, and there seems 7855 dprintk("<-- %s\n", __func__);
7856 * to be no way to prevent it completely.
7857 */
7858 if (nfs41_setup_sequence(session, &lgp->args.seq_args,
7859 &lgp->res.seq_res, task))
7860 return;
7861 ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
7862 NFS_I(lgp->args.inode)->layout,
7863 &lgp->args.range,
7864 lgp->args.ctx->state);
7865 if (ret < 0)
7866 rpc_exit(task, ret);
7867} 7856}
7868 7857
7869static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) 7858static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7870{ 7859{
7871 struct nfs4_layoutget *lgp = calldata; 7860 struct nfs4_layoutget *lgp = calldata;
7861
7862 dprintk("--> %s\n", __func__);
7863 nfs41_sequence_done(task, &lgp->res.seq_res);
7864 dprintk("<-- %s\n", __func__);
7865}
7866
7867static int
7868nfs4_layoutget_handle_exception(struct rpc_task *task,
7869 struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
7870{
7872 struct inode *inode = lgp->args.inode; 7871 struct inode *inode = lgp->args.inode;
7873 struct nfs_server *server = NFS_SERVER(inode); 7872 struct nfs_server *server = NFS_SERVER(inode);
7874 struct pnfs_layout_hdr *lo; 7873 struct pnfs_layout_hdr *lo;
7875 struct nfs4_state *state = NULL; 7874 int status = task->tk_status;
7876 unsigned long timeo, now, giveup;
7877 7875
7878 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); 7876 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
7879 7877
7880 if (!nfs41_sequence_done(task, &lgp->res.seq_res)) 7878 switch (status) {
7881 goto out;
7882
7883 switch (task->tk_status) {
7884 case 0: 7879 case 0:
7885 goto out; 7880 goto out;
7886 7881
@@ -7890,57 +7885,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7890 * retry go inband. 7885 * retry go inband.
7891 */ 7886 */
7892 case -NFS4ERR_LAYOUTUNAVAILABLE: 7887 case -NFS4ERR_LAYOUTUNAVAILABLE:
7893 task->tk_status = -ENODATA; 7888 status = -ENODATA;
7894 goto out; 7889 goto out;
7895 /* 7890 /*
7896 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of 7891 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
7897 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). 7892 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
7898 */ 7893 */
7899 case -NFS4ERR_BADLAYOUT: 7894 case -NFS4ERR_BADLAYOUT:
7900 goto out_overflow; 7895 status = -EOVERFLOW;
7896 goto out;
7901 /* 7897 /*
7902 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client 7898 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
7903 * (or clients) writing to the same RAID stripe except when 7899 * (or clients) writing to the same RAID stripe except when
7904 * the minlength argument is 0 (see RFC5661 section 18.43.3). 7900 * the minlength argument is 0 (see RFC5661 section 18.43.3).
7901 *
7902 * Treat it like we would RECALLCONFLICT -- we retry for a little
7903 * while, and then eventually give up.
7905 */ 7904 */
7906 case -NFS4ERR_LAYOUTTRYLATER: 7905 case -NFS4ERR_LAYOUTTRYLATER:
7907 if (lgp->args.minlength == 0) 7906 if (lgp->args.minlength == 0) {
7908 goto out_overflow; 7907 status = -EOVERFLOW;
7909 /* 7908 goto out;
7910 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
7911 * existing layout before getting a new one).
7912 */
7913 case -NFS4ERR_RECALLCONFLICT:
7914 timeo = rpc_get_timeout(task->tk_client);
7915 giveup = lgp->args.timestamp + timeo;
7916 now = jiffies;
7917 if (time_after(giveup, now)) {
7918 unsigned long delay;
7919
7920 /* Delay for:
7921 * - Not less then NFS4_POLL_RETRY_MIN.
7922 * - One last time a jiffie before we give up
7923 * - exponential backoff (time_now minus start_attempt)
7924 */
7925 delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
7926 min((giveup - now - 1),
7927 now - lgp->args.timestamp));
7928
7929 dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
7930 __func__, delay);
7931 rpc_delay(task, delay);
7932 /* Do not call nfs4_async_handle_error() */
7933 goto out_restart;
7934 } 7909 }
7935 break; 7910 /* Fallthrough */
7911 case -NFS4ERR_RECALLCONFLICT:
7912 nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
7913 exception);
7914 status = -ERECALLCONFLICT;
7915 goto out;
7936 case -NFS4ERR_EXPIRED: 7916 case -NFS4ERR_EXPIRED:
7937 case -NFS4ERR_BAD_STATEID: 7917 case -NFS4ERR_BAD_STATEID:
7918 exception->timeout = 0;
7938 spin_lock(&inode->i_lock); 7919 spin_lock(&inode->i_lock);
7939 if (nfs4_stateid_match(&lgp->args.stateid, 7920 if (nfs4_stateid_match(&lgp->args.stateid,
7940 &lgp->args.ctx->state->stateid)) { 7921 &lgp->args.ctx->state->stateid)) {
7941 spin_unlock(&inode->i_lock); 7922 spin_unlock(&inode->i_lock);
7942 /* If the open stateid was bad, then recover it. */ 7923 /* If the open stateid was bad, then recover it. */
7943 state = lgp->args.ctx->state; 7924 exception->state = lgp->args.ctx->state;
7944 break; 7925 break;
7945 } 7926 }
7946 lo = NFS_I(inode)->layout; 7927 lo = NFS_I(inode)->layout;
@@ -7958,20 +7939,16 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7958 pnfs_free_lseg_list(&head); 7939 pnfs_free_lseg_list(&head);
7959 } else 7940 } else
7960 spin_unlock(&inode->i_lock); 7941 spin_unlock(&inode->i_lock);
7961 goto out_restart; 7942 status = -EAGAIN;
7943 goto out;
7962 } 7944 }
7963 if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) 7945
7964 goto out_restart; 7946 status = nfs4_handle_exception(server, status, exception);
7947 if (exception->retry)
7948 status = -EAGAIN;
7965out: 7949out:
7966 dprintk("<-- %s\n", __func__); 7950 dprintk("<-- %s\n", __func__);
7967 return; 7951 return status;
7968out_restart:
7969 task->tk_status = 0;
7970 rpc_restart_call_prepare(task);
7971 return;
7972out_overflow:
7973 task->tk_status = -EOVERFLOW;
7974 goto out;
7975} 7952}
7976 7953
7977static size_t max_response_pages(struct nfs_server *server) 7954static size_t max_response_pages(struct nfs_server *server)
@@ -8040,7 +8017,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
8040}; 8017};
8041 8018
8042struct pnfs_layout_segment * 8019struct pnfs_layout_segment *
8043nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) 8020nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
8044{ 8021{
8045 struct inode *inode = lgp->args.inode; 8022 struct inode *inode = lgp->args.inode;
8046 struct nfs_server *server = NFS_SERVER(inode); 8023 struct nfs_server *server = NFS_SERVER(inode);
@@ -8060,6 +8037,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
8060 .flags = RPC_TASK_ASYNC, 8037 .flags = RPC_TASK_ASYNC,
8061 }; 8038 };
8062 struct pnfs_layout_segment *lseg = NULL; 8039 struct pnfs_layout_segment *lseg = NULL;
8040 struct nfs4_exception exception = { .timeout = *timeout };
8063 int status = 0; 8041 int status = 0;
8064 8042
8065 dprintk("--> %s\n", __func__); 8043 dprintk("--> %s\n", __func__);
@@ -8073,7 +8051,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
8073 return ERR_PTR(-ENOMEM); 8051 return ERR_PTR(-ENOMEM);
8074 } 8052 }
8075 lgp->args.layout.pglen = max_pages * PAGE_SIZE; 8053 lgp->args.layout.pglen = max_pages * PAGE_SIZE;
8076 lgp->args.timestamp = jiffies;
8077 8054
8078 lgp->res.layoutp = &lgp->args.layout; 8055 lgp->res.layoutp = &lgp->args.layout;
8079 lgp->res.seq_res.sr_slot = NULL; 8056 lgp->res.seq_res.sr_slot = NULL;
@@ -8083,13 +8060,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
8083 if (IS_ERR(task)) 8060 if (IS_ERR(task))
8084 return ERR_CAST(task); 8061 return ERR_CAST(task);
8085 status = nfs4_wait_for_completion_rpc_task(task); 8062 status = nfs4_wait_for_completion_rpc_task(task);
8086 if (status == 0) 8063 if (status == 0) {
8087 status = task->tk_status; 8064 status = nfs4_layoutget_handle_exception(task, lgp, &exception);
8065 *timeout = exception.timeout;
8066 }
8067
8088 trace_nfs4_layoutget(lgp->args.ctx, 8068 trace_nfs4_layoutget(lgp->args.ctx,
8089 &lgp->args.range, 8069 &lgp->args.range,
8090 &lgp->res.range, 8070 &lgp->res.range,
8091 &lgp->res.stateid, 8071 &lgp->res.stateid,
8092 status); 8072 status);
8073
8093 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ 8074 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
8094 if (status == 0 && lgp->res.layoutp->len) 8075 if (status == 0 && lgp->res.layoutp->len)
8095 lseg = pnfs_layout_process(lgp); 8076 lseg = pnfs_layout_process(lgp);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2c8d05dae5b1..9c150b153782 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
1520 { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ 1520 { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
1521 { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ 1521 { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
1522 { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ 1522 { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
1523 { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \
1524 { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \
1523 { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) 1525 { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
1524 1526
1525TRACE_EVENT(pnfs_update_layout, 1527TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
1528 u64 count, 1530 u64 count,
1529 enum pnfs_iomode iomode, 1531 enum pnfs_iomode iomode,
1530 struct pnfs_layout_hdr *lo, 1532 struct pnfs_layout_hdr *lo,
1533 struct pnfs_layout_segment *lseg,
1531 enum pnfs_update_layout_reason reason 1534 enum pnfs_update_layout_reason reason
1532 ), 1535 ),
1533 TP_ARGS(inode, pos, count, iomode, lo, reason), 1536 TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
1534 TP_STRUCT__entry( 1537 TP_STRUCT__entry(
1535 __field(dev_t, dev) 1538 __field(dev_t, dev)
1536 __field(u64, fileid) 1539 __field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
1540 __field(enum pnfs_iomode, iomode) 1543 __field(enum pnfs_iomode, iomode)
1541 __field(int, layoutstateid_seq) 1544 __field(int, layoutstateid_seq)
1542 __field(u32, layoutstateid_hash) 1545 __field(u32, layoutstateid_hash)
1546 __field(long, lseg)
1543 __field(enum pnfs_update_layout_reason, reason) 1547 __field(enum pnfs_update_layout_reason, reason)
1544 ), 1548 ),
1545 TP_fast_assign( 1549 TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
1559 __entry->layoutstateid_seq = 0; 1563 __entry->layoutstateid_seq = 0;
1560 __entry->layoutstateid_hash = 0; 1564 __entry->layoutstateid_hash = 0;
1561 } 1565 }
1566 __entry->lseg = (long)lseg;
1562 ), 1567 ),
1563 TP_printk( 1568 TP_printk(
1564 "fileid=%02x:%02x:%llu fhandle=0x%08x " 1569 "fileid=%02x:%02x:%llu fhandle=0x%08x "
1565 "iomode=%s pos=%llu count=%llu " 1570 "iomode=%s pos=%llu count=%llu "
1566 "layoutstateid=%d:0x%08x (%s)", 1571 "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
1567 MAJOR(__entry->dev), MINOR(__entry->dev), 1572 MAJOR(__entry->dev), MINOR(__entry->dev),
1568 (unsigned long long)__entry->fileid, 1573 (unsigned long long)__entry->fileid,
1569 __entry->fhandle, 1574 __entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
1571 (unsigned long long)__entry->pos, 1576 (unsigned long long)__entry->pos,
1572 (unsigned long long)__entry->count, 1577 (unsigned long long)__entry->count,
1573 __entry->layoutstateid_seq, __entry->layoutstateid_hash, 1578 __entry->layoutstateid_seq, __entry->layoutstateid_hash,
1579 __entry->lseg,
1574 show_pnfs_update_layout_reason(__entry->reason) 1580 show_pnfs_update_layout_reason(__entry->reason)
1575 ) 1581 )
1576); 1582);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2a609af845fe..46339a7fb191 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -796,45 +796,18 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
796 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 796 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
797} 797}
798 798
799int
800pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
801 const struct pnfs_layout_range *range,
802 struct nfs4_state *open_state)
803{
804 int status = 0;
805
806 dprintk("--> %s\n", __func__);
807 spin_lock(&lo->plh_inode->i_lock);
808 if (pnfs_layoutgets_blocked(lo)) {
809 status = -EAGAIN;
810 } else if (!nfs4_valid_open_stateid(open_state)) {
811 status = -EBADF;
812 } else if (list_empty(&lo->plh_segs) ||
813 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
814 int seq;
815
816 do {
817 seq = read_seqbegin(&open_state->seqlock);
818 nfs4_stateid_copy(dst, &open_state->stateid);
819 } while (read_seqretry(&open_state->seqlock, seq));
820 } else
821 nfs4_stateid_copy(dst, &lo->plh_stateid);
822 spin_unlock(&lo->plh_inode->i_lock);
823 dprintk("<-- %s\n", __func__);
824 return status;
825}
826
827/* 799/*
828* Get layout from server. 800 * Get layout from server.
829* for now, assume that whole file layouts are requested. 801 * for now, assume that whole file layouts are requested.
830* arg->offset: 0 802 * arg->offset: 0
831* arg->length: all ones 803 * arg->length: all ones
832*/ 804 */
833static struct pnfs_layout_segment * 805static struct pnfs_layout_segment *
834send_layoutget(struct pnfs_layout_hdr *lo, 806send_layoutget(struct pnfs_layout_hdr *lo,
835 struct nfs_open_context *ctx, 807 struct nfs_open_context *ctx,
808 nfs4_stateid *stateid,
836 const struct pnfs_layout_range *range, 809 const struct pnfs_layout_range *range,
837 gfp_t gfp_flags) 810 long *timeout, gfp_t gfp_flags)
838{ 811{
839 struct inode *ino = lo->plh_inode; 812 struct inode *ino = lo->plh_inode;
840 struct nfs_server *server = NFS_SERVER(ino); 813 struct nfs_server *server = NFS_SERVER(ino);
@@ -868,10 +841,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
868 lgp->args.type = server->pnfs_curr_ld->id; 841 lgp->args.type = server->pnfs_curr_ld->id;
869 lgp->args.inode = ino; 842 lgp->args.inode = ino;
870 lgp->args.ctx = get_nfs_open_context(ctx); 843 lgp->args.ctx = get_nfs_open_context(ctx);
844 nfs4_stateid_copy(&lgp->args.stateid, stateid);
871 lgp->gfp_flags = gfp_flags; 845 lgp->gfp_flags = gfp_flags;
872 lgp->cred = lo->plh_lc_cred; 846 lgp->cred = lo->plh_lc_cred;
873 847
874 return nfs4_proc_layoutget(lgp, gfp_flags); 848 return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
875} 849}
876 850
877static void pnfs_clear_layoutcommit(struct inode *inode, 851static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -1511,27 +1485,30 @@ pnfs_update_layout(struct inode *ino,
1511 .offset = pos, 1485 .offset = pos,
1512 .length = count, 1486 .length = count,
1513 }; 1487 };
1514 unsigned pg_offset; 1488 unsigned pg_offset, seq;
1515 struct nfs_server *server = NFS_SERVER(ino); 1489 struct nfs_server *server = NFS_SERVER(ino);
1516 struct nfs_client *clp = server->nfs_client; 1490 struct nfs_client *clp = server->nfs_client;
1517 struct pnfs_layout_hdr *lo; 1491 struct pnfs_layout_hdr *lo = NULL;
1518 struct pnfs_layout_segment *lseg = NULL; 1492 struct pnfs_layout_segment *lseg = NULL;
1493 nfs4_stateid stateid;
1494 long timeout = 0;
1495 unsigned long giveup = jiffies + rpc_get_timeout(server->client);
1519 bool first; 1496 bool first;
1520 1497
1521 if (!pnfs_enabled_sb(NFS_SERVER(ino))) { 1498 if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1522 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1499 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1523 PNFS_UPDATE_LAYOUT_NO_PNFS); 1500 PNFS_UPDATE_LAYOUT_NO_PNFS);
1524 goto out; 1501 goto out;
1525 } 1502 }
1526 1503
1527 if (iomode == IOMODE_READ && i_size_read(ino) == 0) { 1504 if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
1528 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1505 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1529 PNFS_UPDATE_LAYOUT_RD_ZEROLEN); 1506 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
1530 goto out; 1507 goto out;
1531 } 1508 }
1532 1509
1533 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { 1510 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1534 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1511 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1535 PNFS_UPDATE_LAYOUT_MDSTHRESH); 1512 PNFS_UPDATE_LAYOUT_MDSTHRESH);
1536 goto out; 1513 goto out;
1537 } 1514 }
@@ -1542,14 +1519,14 @@ lookup_again:
1542 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1519 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1543 if (lo == NULL) { 1520 if (lo == NULL) {
1544 spin_unlock(&ino->i_lock); 1521 spin_unlock(&ino->i_lock);
1545 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1522 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1546 PNFS_UPDATE_LAYOUT_NOMEM); 1523 PNFS_UPDATE_LAYOUT_NOMEM);
1547 goto out; 1524 goto out;
1548 } 1525 }
1549 1526
1550 /* Do we even need to bother with this? */ 1527 /* Do we even need to bother with this? */
1551 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1528 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1552 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1529 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1553 PNFS_UPDATE_LAYOUT_BULK_RECALL); 1530 PNFS_UPDATE_LAYOUT_BULK_RECALL);
1554 dprintk("%s matches recall, use MDS\n", __func__); 1531 dprintk("%s matches recall, use MDS\n", __func__);
1555 goto out_unlock; 1532 goto out_unlock;
@@ -1557,14 +1534,34 @@ lookup_again:
1557 1534
1558 /* if LAYOUTGET already failed once we don't try again */ 1535 /* if LAYOUTGET already failed once we don't try again */
1559 if (pnfs_layout_io_test_failed(lo, iomode)) { 1536 if (pnfs_layout_io_test_failed(lo, iomode)) {
1560 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1537 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1561 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); 1538 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1562 goto out_unlock; 1539 goto out_unlock;
1563 } 1540 }
1564 1541
1565 first = list_empty(&lo->plh_segs); 1542 lseg = pnfs_find_lseg(lo, &arg);
1566 if (first) { 1543 if (lseg) {
1567 /* The first layoutget for the file. Need to serialize per 1544 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1545 PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1546 goto out_unlock;
1547 }
1548
1549 if (!nfs4_valid_open_stateid(ctx->state)) {
1550 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1551 PNFS_UPDATE_LAYOUT_INVALID_OPEN);
1552 goto out_unlock;
1553 }
1554
1555 /*
1556 * Choose a stateid for the LAYOUTGET. If we don't have a layout
1557 * stateid, or it has been invalidated, then we must use the open
1558 * stateid.
1559 */
1560 if (lo->plh_stateid.seqid == 0 ||
1561 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
1562
1563 /*
1564 * The first layoutget for the file. Need to serialize per
1568 * RFC 5661 Errata 3208. 1565 * RFC 5661 Errata 3208.
1569 */ 1566 */
1570 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, 1567 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1573,18 +1570,17 @@ lookup_again:
1573 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, 1570 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1574 TASK_UNINTERRUPTIBLE); 1571 TASK_UNINTERRUPTIBLE);
1575 pnfs_put_layout_hdr(lo); 1572 pnfs_put_layout_hdr(lo);
1573 dprintk("%s retrying\n", __func__);
1576 goto lookup_again; 1574 goto lookup_again;
1577 } 1575 }
1576
1577 first = true;
1578 do {
1579 seq = read_seqbegin(&ctx->state->seqlock);
1580 nfs4_stateid_copy(&stateid, &ctx->state->stateid);
1581 } while (read_seqretry(&ctx->state->seqlock, seq));
1578 } else { 1582 } else {
1579 /* Check to see if the layout for the given range 1583 nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1580 * already exists
1581 */
1582 lseg = pnfs_find_lseg(lo, &arg);
1583 if (lseg) {
1584 trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1585 PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1586 goto out_unlock;
1587 }
1588 } 1584 }
1589 1585
1590 /* 1586 /*
@@ -1599,15 +1595,17 @@ lookup_again:
1599 pnfs_clear_first_layoutget(lo); 1595 pnfs_clear_first_layoutget(lo);
1600 pnfs_put_layout_hdr(lo); 1596 pnfs_put_layout_hdr(lo);
1601 dprintk("%s retrying\n", __func__); 1597 dprintk("%s retrying\n", __func__);
1598 trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1599 lseg, PNFS_UPDATE_LAYOUT_RETRY);
1602 goto lookup_again; 1600 goto lookup_again;
1603 } 1601 }
1604 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1602 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1605 PNFS_UPDATE_LAYOUT_RETURN); 1603 PNFS_UPDATE_LAYOUT_RETURN);
1606 goto out_put_layout_hdr; 1604 goto out_put_layout_hdr;
1607 } 1605 }
1608 1606
1609 if (pnfs_layoutgets_blocked(lo)) { 1607 if (pnfs_layoutgets_blocked(lo)) {
1610 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1608 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1611 PNFS_UPDATE_LAYOUT_BLOCKED); 1609 PNFS_UPDATE_LAYOUT_BLOCKED);
1612 goto out_unlock; 1610 goto out_unlock;
1613 } 1611 }
@@ -1632,26 +1630,36 @@ lookup_again:
1632 if (arg.length != NFS4_MAX_UINT64) 1630 if (arg.length != NFS4_MAX_UINT64)
1633 arg.length = PAGE_ALIGN(arg.length); 1631 arg.length = PAGE_ALIGN(arg.length);
1634 1632
1635 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1633 lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
1634 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1635 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1636 if (IS_ERR(lseg)) { 1636 if (IS_ERR(lseg)) {
1637 if (lseg == ERR_PTR(-EAGAIN)) { 1637 switch(PTR_ERR(lseg)) {
1638 case -ERECALLCONFLICT:
1639 if (time_after(jiffies, giveup))
1640 lseg = NULL;
1641 /* Fallthrough */
1642 case -EAGAIN:
1643 pnfs_put_layout_hdr(lo);
1638 if (first) 1644 if (first)
1639 pnfs_clear_first_layoutget(lo); 1645 pnfs_clear_first_layoutget(lo);
1640 pnfs_put_layout_hdr(lo); 1646 if (lseg) {
1641 goto lookup_again; 1647 trace_pnfs_update_layout(ino, pos, count,
1642 } 1648 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1643 1649 goto lookup_again;
1644 if (!nfs_error_is_fatal(PTR_ERR(lseg))) { 1650 }
1645 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 1651 /* Fallthrough */
1646 lseg = NULL; 1652 default:
1653 if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
1654 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1655 lseg = NULL;
1656 }
1647 } 1657 }
1648 } else { 1658 } else {
1649 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 1659 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1650 } 1660 }
1651 1661
1652 atomic_dec(&lo->plh_outstanding); 1662 atomic_dec(&lo->plh_outstanding);
1653 trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1654 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1655out_put_layout_hdr: 1663out_put_layout_hdr:
1656 if (first) 1664 if (first)
1657 pnfs_clear_first_layoutget(lo); 1665 pnfs_clear_first_layoutget(lo);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 971068b58647..f9f3331bef49 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -228,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
228extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 228extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
229 struct pnfs_device *dev, 229 struct pnfs_device *dev,
230 struct rpc_cred *cred); 230 struct rpc_cred *cred);
231extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 231extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
232extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); 232extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
233 233
234/* pnfs.c */ 234/* pnfs.c */
@@ -260,10 +260,6 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
260void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 260void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
261 const nfs4_stateid *new, 261 const nfs4_stateid *new,
262 bool update_barrier); 262 bool update_barrier);
263int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
264 struct pnfs_layout_hdr *lo,
265 const struct pnfs_layout_range *range,
266 struct nfs4_state *open_state);
267int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 263int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
268 struct list_head *tmp_list, 264 struct list_head *tmp_list,
269 const struct pnfs_layout_range *recall_range, 265 const struct pnfs_layout_range *recall_range,
diff --git a/include/linux/errno.h b/include/linux/errno.h
index 89627b9187f9..7ce9fb1b7d28 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -28,5 +28,6 @@
28#define EBADTYPE 527 /* Type not supported by server */ 28#define EBADTYPE 527 /* Type not supported by server */
29#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ 29#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
30#define EIOCBQUEUED 529 /* iocb queued, will get completion event */ 30#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
31#define ERECALLCONFLICT 530 /* conflict with recalled state */
31 32
32#endif 33#endif
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e1692c96cbc8..bfed6b367350 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -637,7 +637,9 @@ enum pnfs_update_layout_reason {
637 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, 637 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL,
638 PNFS_UPDATE_LAYOUT_FOUND_CACHED, 638 PNFS_UPDATE_LAYOUT_FOUND_CACHED,
639 PNFS_UPDATE_LAYOUT_RETURN, 639 PNFS_UPDATE_LAYOUT_RETURN,
640 PNFS_UPDATE_LAYOUT_RETRY,
640 PNFS_UPDATE_LAYOUT_BLOCKED, 641 PNFS_UPDATE_LAYOUT_BLOCKED,
642 PNFS_UPDATE_LAYOUT_INVALID_OPEN,
641 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, 643 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
642}; 644};
643 645
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e70ed54dad94..ccb2928a0e64 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -233,7 +233,6 @@ struct nfs4_layoutget_args {
233 struct inode *inode; 233 struct inode *inode;
234 struct nfs_open_context *ctx; 234 struct nfs_open_context *ctx;
235 nfs4_stateid stateid; 235 nfs4_stateid stateid;
236 unsigned long timestamp;
237 struct nfs4_layoutdriver_data layout; 236 struct nfs4_layoutdriver_data layout;
238}; 237};
239 238
@@ -251,7 +250,6 @@ struct nfs4_layoutget {
251 struct nfs4_layoutget_res res; 250 struct nfs4_layoutget_res res;
252 struct rpc_cred *cred; 251 struct rpc_cred *cred;
253 gfp_t gfp_flags; 252 gfp_t gfp_flags;
254 long timeout;
255}; 253};
256 254
257struct nfs4_getdeviceinfo_args { 255struct nfs4_getdeviceinfo_args {