about summary refs log tree commit diff stats
path: root/fs/nfs/pnfs.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nfs/pnfs.c')
-rw-r--r-- fs/nfs/pnfs.c 471
1 files changed, 381 insertions, 90 deletions
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS 39#define NFSDBG_FACILITY NFSDBG_PNFS
39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
50 */ 51 */
51static LIST_HEAD(pnfs_modules_tbl); 52static LIST_HEAD(pnfs_modules_tbl);
52 53
54static int
55pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
56 enum pnfs_iomode iomode, bool sync);
57
53/* Return the registered pnfs layout driver module matching given id */ 58/* Return the registered pnfs layout driver module matching given id */
54static struct pnfs_layoutdriver_type * 59static struct pnfs_layoutdriver_type *
55find_pnfs_driver_locked(u32 id) 60find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
238 struct inode *inode = lo->plh_inode; 243 struct inode *inode = lo->plh_inode;
239 244
240 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
246 if (!list_empty(&lo->plh_segs))
247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
241 pnfs_detach_layout_hdr(lo); 248 pnfs_detach_layout_hdr(lo);
242 spin_unlock(&inode->i_lock); 249 spin_unlock(&inode->i_lock);
243 pnfs_free_layout_hdr(lo); 250 pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
337 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
338} 345}
339 346
347/* Return true if layoutreturn is needed */
348static bool
349pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
350 struct pnfs_layout_segment *lseg)
351{
352 struct pnfs_layout_segment *s;
353
354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
355 return false;
356
357 list_for_each_entry(s, &lo->plh_segs, pls_list)
358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
359 return false;
360
361 return true;
362}
363
364static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
365 struct pnfs_layout_hdr *lo, struct inode *inode)
366{
367 lo = lseg->pls_layout;
368 inode = lo->plh_inode;
369
370 spin_lock(&inode->i_lock);
371 if (pnfs_layout_need_return(lo, lseg)) {
372 nfs4_stateid stateid;
373 enum pnfs_iomode iomode;
374
375 stateid = lo->plh_stateid;
376 iomode = lo->plh_return_iomode;
377 /* decreased in pnfs_send_layoutreturn() */
378 lo->plh_block_lgets++;
379 lo->plh_return_iomode = 0;
380 spin_unlock(&inode->i_lock);
381 pnfs_get_layout_hdr(lo);
382
383 /* Send an async layoutreturn so we don't deadlock */
384 pnfs_send_layoutreturn(lo, stateid, iomode, false);
385 } else
386 spin_unlock(&inode->i_lock);
387}
388
340void 389void
341pnfs_put_lseg(struct pnfs_layout_segment *lseg) 390pnfs_put_lseg(struct pnfs_layout_segment *lseg)
342{ 391{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
349 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
350 atomic_read(&lseg->pls_refcount), 399 atomic_read(&lseg->pls_refcount),
351 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
401
402 /* Handle the case where refcount != 1 */
403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
404 return;
405
352 lo = lseg->pls_layout; 406 lo = lseg->pls_layout;
353 inode = lo->plh_inode; 407 inode = lo->plh_inode;
408 /* Do we need a layoutreturn? */
409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
411
354 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
355 pnfs_get_layout_hdr(lo); 413 pnfs_get_layout_hdr(lo);
356 pnfs_layout_remove_lseg(lo, lseg); 414 pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
543 pnfs_get_layout_hdr(lo); 601 pnfs_get_layout_hdr(lo);
544 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
604 pnfs_clear_retry_layoutget(lo);
546 spin_unlock(&nfsi->vfs_inode.i_lock); 605 spin_unlock(&nfsi->vfs_inode.i_lock);
547 pnfs_free_lseg_list(&tmp_list); 606 pnfs_free_lseg_list(&tmp_list);
548 pnfs_put_layout_hdr(lo); 607 pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
740 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
741} 800}
742 801
802static bool
803pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
804 struct pnfs_layout_range *range)
805{
806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
807 (lo->plh_return_iomode == IOMODE_ANY ||
808 lo->plh_return_iomode == range->iomode);
809}
810
743/* lget is set to 1 if called from inside send_layoutget call chain */ 811/* lget is set to 1 if called from inside send_layoutget call chain */
744static bool 812static bool
745pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) 813pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
814 struct pnfs_layout_range *range, int lget)
746{ 815{
747 return lo->plh_block_lgets || 816 return lo->plh_block_lgets ||
748 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
749 (list_empty(&lo->plh_segs) && 818 (list_empty(&lo->plh_segs) &&
750 (atomic_read(&lo->plh_outstanding) > lget)); 819 (atomic_read(&lo->plh_outstanding) > lget)) ||
820 pnfs_layout_returning(lo, range);
751} 821}
752 822
753int 823int
754pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 824pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
825 struct pnfs_layout_range *range,
755 struct nfs4_state *open_state) 826 struct nfs4_state *open_state)
756{ 827{
757 int status = 0; 828 int status = 0;
758 829
759 dprintk("--> %s\n", __func__); 830 dprintk("--> %s\n", __func__);
760 spin_lock(&lo->plh_inode->i_lock); 831 spin_lock(&lo->plh_inode->i_lock);
761 if (pnfs_layoutgets_blocked(lo, 1)) { 832 if (pnfs_layoutgets_blocked(lo, range, 1)) {
762 status = -EAGAIN; 833 status = -EAGAIN;
763 } else if (!nfs4_valid_open_stateid(open_state)) { 834 } else if (!nfs4_valid_open_stateid(open_state)) {
764 status = -EBADF; 835 status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
825 pnfs_layout_io_set_failed(lo, range->iomode); 896 pnfs_layout_io_set_failed(lo, range->iomode);
826 } 897 }
827 return NULL; 898 return NULL;
828 } 899 } else
900 pnfs_layout_clear_fail_bit(lo,
901 pnfs_iomode_to_fail_bit(range->iomode));
829 902
830 return lseg; 903 return lseg;
831} 904}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
845 } 918 }
846} 919}
847 920
921void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
922{
923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
924 smp_mb__after_atomic();
925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
926}
927
928static int
929pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
930 enum pnfs_iomode iomode, bool sync)
931{
932 struct inode *ino = lo->plh_inode;
933 struct nfs4_layoutreturn *lrp;
934 int status = 0;
935
936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
937 if (unlikely(lrp == NULL)) {
938 status = -ENOMEM;
939 spin_lock(&ino->i_lock);
940 lo->plh_block_lgets--;
941 pnfs_clear_layoutreturn_waitbit(lo);
942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
943 spin_unlock(&ino->i_lock);
944 pnfs_put_layout_hdr(lo);
945 goto out;
946 }
947
948 lrp->args.stateid = stateid;
949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
950 lrp->args.inode = ino;
951 lrp->args.range.iomode = iomode;
952 lrp->args.range.offset = 0;
953 lrp->args.range.length = NFS4_MAX_UINT64;
954 lrp->args.layout = lo;
955 lrp->clp = NFS_SERVER(ino)->nfs_client;
956 lrp->cred = lo->plh_lc_cred;
957
958 status = nfs4_proc_layoutreturn(lrp, sync);
959out:
960 dprintk("<-- %s status: %d\n", __func__, status);
961 return status;
962}
963
848/* 964/*
849 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
850 * when the layout segment list is empty. 966 * when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
859 struct pnfs_layout_hdr *lo = NULL; 975 struct pnfs_layout_hdr *lo = NULL;
860 struct nfs_inode *nfsi = NFS_I(ino); 976 struct nfs_inode *nfsi = NFS_I(ino);
861 LIST_HEAD(tmp_list); 977 LIST_HEAD(tmp_list);
862 struct nfs4_layoutreturn *lrp;
863 nfs4_stateid stateid; 978 nfs4_stateid stateid;
864 int status = 0, empty; 979 int status = 0, empty;
865 980
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
901 spin_unlock(&ino->i_lock); 1016 spin_unlock(&ino->i_lock);
902 pnfs_free_lseg_list(&tmp_list); 1017 pnfs_free_lseg_list(&tmp_list);
903 1018
904 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); 1019 status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
905 if (unlikely(lrp == NULL)) {
906 status = -ENOMEM;
907 spin_lock(&ino->i_lock);
908 lo->plh_block_lgets--;
909 spin_unlock(&ino->i_lock);
910 pnfs_put_layout_hdr(lo);
911 goto out;
912 }
913
914 lrp->args.stateid = stateid;
915 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
916 lrp->args.inode = ino;
917 lrp->args.layout = lo;
918 lrp->clp = NFS_SERVER(ino)->nfs_client;
919 lrp->cred = lo->plh_lc_cred;
920
921 status = nfs4_proc_layoutreturn(lrp);
922out: 1020out:
923 dprintk("<-- %s status: %d\n", __func__, status); 1021 dprintk("<-- %s status: %d\n", __func__, status);
924 return status; 1022 return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
954 1052
955bool pnfs_roc(struct inode *ino) 1053bool pnfs_roc(struct inode *ino)
956{ 1054{
1055 struct nfs_inode *nfsi = NFS_I(ino);
1056 struct nfs_open_context *ctx;
1057 struct nfs4_state *state;
957 struct pnfs_layout_hdr *lo; 1058 struct pnfs_layout_hdr *lo;
958 struct pnfs_layout_segment *lseg, *tmp; 1059 struct pnfs_layout_segment *lseg, *tmp;
1060 nfs4_stateid stateid;
959 LIST_HEAD(tmp_list); 1061 LIST_HEAD(tmp_list);
960 bool found = false; 1062 bool found = false, layoutreturn = false;
961 1063
962 spin_lock(&ino->i_lock); 1064 spin_lock(&ino->i_lock);
963 lo = NFS_I(ino)->layout; 1065 lo = nfsi->layout;
964 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
965 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
966 goto out_nolayout; 1068 goto out_noroc;
1069
1070 /* Don't return layout if we hold a delegation */
1071 if (nfs4_check_delegation(ino, FMODE_READ))
1072 goto out_noroc;
1073
1074 list_for_each_entry(ctx, &nfsi->open_files, list) {
1075 state = ctx->state;
1076 /* Don't return layout if there is open file state */
1077 if (state != NULL && state->state != 0)
1078 goto out_noroc;
1079 }
1080
1081 pnfs_clear_retry_layoutget(lo);
967 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
968 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
969 mark_lseg_invalid(lseg, &tmp_list); 1084 mark_lseg_invalid(lseg, &tmp_list);
970 found = true; 1085 found = true;
971 } 1086 }
972 if (!found) 1087 if (!found)
973 goto out_nolayout; 1088 goto out_noroc;
974 lo->plh_block_lgets++; 1089 lo->plh_block_lgets++;
975 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
976 spin_unlock(&ino->i_lock); 1091 spin_unlock(&ino->i_lock);
977 pnfs_free_lseg_list(&tmp_list); 1092 pnfs_free_lseg_list(&tmp_list);
978 return true; 1093 return true;
979 1094
980out_nolayout: 1095out_noroc:
1096 if (lo) {
1097 stateid = lo->plh_stateid;
1098 layoutreturn =
1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1100 &lo->plh_flags);
1101 if (layoutreturn) {
1102 lo->plh_block_lgets++;
1103 pnfs_get_layout_hdr(lo);
1104 }
1105 }
981 spin_unlock(&ino->i_lock); 1106 spin_unlock(&ino->i_lock);
1107 if (layoutreturn)
1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
982 return false; 1109 return false;
983} 1110}
984 1111
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1013 struct nfs_inode *nfsi = NFS_I(ino); 1140 struct nfs_inode *nfsi = NFS_I(ino);
1014 struct pnfs_layout_hdr *lo; 1141 struct pnfs_layout_hdr *lo;
1015 struct pnfs_layout_segment *lseg; 1142 struct pnfs_layout_segment *lseg;
1143 nfs4_stateid stateid;
1016 u32 current_seqid; 1144 u32 current_seqid;
1017 bool found = false; 1145 bool found = false, layoutreturn = false;
1018 1146
1019 spin_lock(&ino->i_lock); 1147 spin_lock(&ino->i_lock);
1020 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1031 */ 1159 */
1032 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1033out: 1161out:
1162 if (!found) {
1163 stateid = lo->plh_stateid;
1164 layoutreturn =
1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1166 &lo->plh_flags);
1167 if (layoutreturn) {
1168 lo->plh_block_lgets++;
1169 pnfs_get_layout_hdr(lo);
1170 }
1171 }
1034 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1173 if (layoutreturn) {
1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1176 }
1035 return found; 1177 return found;
1036} 1178}
1037 1179
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1178 1320
1179 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1180 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1181 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1182 ret = pnfs_get_lseg(lseg); 1325 ret = pnfs_get_lseg(lseg);
1183 break; 1326 break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1266 return ret; 1409 return ret;
1267} 1410}
1268 1411
1412/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1413static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1414{
1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1416 return 1;
1417 return nfs_wait_bit_killable(key);
1418}
1419
1420static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1421{
1422 /*
1423 * send layoutcommit as it can hold up layoutreturn due to lseg
1424 * reference
1425 */
1426 pnfs_layoutcommit_inode(lo->plh_inode, false);
1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1428 pnfs_layoutget_retry_bit_wait,
1429 TASK_UNINTERRUPTIBLE);
1430}
1431
1432static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1433{
1434 unsigned long *bitlock = &lo->plh_flags;
1435
1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1437 smp_mb__after_atomic();
1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1439}
1440
1269/* 1441/*
1270 * Layout segment is retrieved from the server if not cached. 1442 * Layout segment is retrieved from the server if not cached.
1271 * The appropriate layout segment is referenced and returned to the caller. 1443 * The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
1296 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1297 goto out; 1469 goto out;
1298 1470
1471lookup_again:
1472 first = false;
1299 spin_lock(&ino->i_lock); 1473 spin_lock(&ino->i_lock);
1300 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1301 if (lo == NULL) { 1475 if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
1310 } 1484 }
1311 1485
1312 /* if LAYOUTGET already failed once we don't try again */ 1486 /* if LAYOUTGET already failed once we don't try again */
1313 if (pnfs_layout_io_test_failed(lo, iomode)) 1487 if (pnfs_layout_io_test_failed(lo, iomode) &&
1488 !pnfs_should_retry_layoutget(lo))
1314 goto out_unlock; 1489 goto out_unlock;
1315 1490
1316 /* Check to see if the layout for the given range already exists */ 1491 first = list_empty(&lo->plh_segs);
1317 lseg = pnfs_find_lseg(lo, &arg); 1492 if (first) {
1318 if (lseg) 1493 /* The first layoutget for the file. Need to serialize per
1319 goto out_unlock; 1494 * RFC 5661 Errata 3208.
1495 */
1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1497 &lo->plh_flags)) {
1498 spin_unlock(&ino->i_lock);
1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1500 TASK_UNINTERRUPTIBLE);
1501 pnfs_put_layout_hdr(lo);
1502 goto lookup_again;
1503 }
1504 } else {
1505 /* Check to see if the layout for the given range
1506 * already exists
1507 */
1508 lseg = pnfs_find_lseg(lo, &arg);
1509 if (lseg)
1510 goto out_unlock;
1511 }
1512
1513 /*
1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1515 * for LAYOUTRETURN even if first is true.
1516 */
1517 if (!lseg && pnfs_should_retry_layoutget(lo) &&
1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1519 spin_unlock(&ino->i_lock);
1520 dprintk("%s wait for layoutreturn\n", __func__);
1521 if (pnfs_prepare_to_retry_layoutget(lo)) {
1522 if (first)
1523 pnfs_clear_first_layoutget(lo);
1524 pnfs_put_layout_hdr(lo);
1525 dprintk("%s retrying\n", __func__);
1526 goto lookup_again;
1527 }
1528 goto out_put_layout_hdr;
1529 }
1320 1530
1321 if (pnfs_layoutgets_blocked(lo, 0)) 1531 if (pnfs_layoutgets_blocked(lo, &arg, 0))
1322 goto out_unlock; 1532 goto out_unlock;
1323 atomic_inc(&lo->plh_outstanding); 1533 atomic_inc(&lo->plh_outstanding);
1324
1325 first = list_empty(&lo->plh_layouts) ? true : false;
1326 spin_unlock(&ino->i_lock); 1534 spin_unlock(&ino->i_lock);
1327 1535
1328 if (first) { 1536 if (list_empty(&lo->plh_layouts)) {
1329 /* The lo must be on the clp list if there is any 1537 /* The lo must be on the clp list if there is any
1330 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1331 */ 1539 */
1332 spin_lock(&clp->cl_lock); 1540 spin_lock(&clp->cl_lock);
1333 list_add_tail(&lo->plh_layouts, &server->layouts); 1541 if (list_empty(&lo->plh_layouts))
1542 list_add_tail(&lo->plh_layouts, &server->layouts);
1334 spin_unlock(&clp->cl_lock); 1543 spin_unlock(&clp->cl_lock);
1335 } 1544 }
1336 1545
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
1343 arg.length = PAGE_CACHE_ALIGN(arg.length); 1552 arg.length = PAGE_CACHE_ALIGN(arg.length);
1344 1553
1345 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1555 pnfs_clear_retry_layoutget(lo);
1346 atomic_dec(&lo->plh_outstanding); 1556 atomic_dec(&lo->plh_outstanding);
1347out_put_layout_hdr: 1557out_put_layout_hdr:
1558 if (first)
1559 pnfs_clear_first_layoutget(lo);
1348 pnfs_put_layout_hdr(lo); 1560 pnfs_put_layout_hdr(lo);
1349out: 1561out:
1350 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1393 goto out_forget_reply; 1605 goto out_forget_reply;
1394 } 1606 }
1395 1607
1396 if (pnfs_layoutgets_blocked(lo, 1)) { 1608 if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1397 dprintk("%s forget reply due to state\n", __func__); 1609 dprintk("%s forget reply due to state\n", __func__);
1398 goto out_forget_reply; 1610 goto out_forget_reply;
1399 } 1611 }
@@ -1440,24 +1652,79 @@ out_forget_reply:
1440 goto out; 1652 goto out;
1441} 1653}
1442 1654
1655static void
1656pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1657 struct list_head *tmp_list,
1658 struct pnfs_layout_range *return_range)
1659{
1660 struct pnfs_layout_segment *lseg, *next;
1661
1662 dprintk("%s:Begin lo %p\n", __func__, lo);
1663
1664 if (list_empty(&lo->plh_segs))
1665 return;
1666
1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1668 if (should_free_lseg(&lseg->pls_range, return_range)) {
1669 dprintk("%s: marking lseg %p iomode %d "
1670 "offset %llu length %llu\n", __func__,
1671 lseg, lseg->pls_range.iomode,
1672 lseg->pls_range.offset,
1673 lseg->pls_range.length);
1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1675 mark_lseg_invalid(lseg, tmp_list);
1676 }
1677}
1678
1679void pnfs_error_mark_layout_for_return(struct inode *inode,
1680 struct pnfs_layout_segment *lseg)
1681{
1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1684 struct pnfs_layout_range range = {
1685 .iomode = lseg->pls_range.iomode,
1686 .offset = 0,
1687 .length = NFS4_MAX_UINT64,
1688 };
1689 LIST_HEAD(free_me);
1690
1691 spin_lock(&inode->i_lock);
1692 /* set failure bit so that pnfs path will be retried later */
1693 pnfs_layout_set_fail_bit(lo, iomode);
1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
1695 if (lo->plh_return_iomode == 0)
1696 lo->plh_return_iomode = range.iomode;
1697 else if (lo->plh_return_iomode != range.iomode)
1698 lo->plh_return_iomode = IOMODE_ANY;
1699 /*
1700 * mark all matching lsegs so that we are sure to have no live
1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1702 * for how it works.
1703 */
1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1705 spin_unlock(&inode->i_lock);
1706 pnfs_free_lseg_list(&free_me);
1707}
1708EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1709
1443void 1710void
1444pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1711pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1445{ 1712{
1446 u64 rd_size = req->wb_bytes; 1713 u64 rd_size = req->wb_bytes;
1447 1714
1448 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1715 if (pgio->pg_lseg == NULL) {
1449 1716 if (pgio->pg_dreq == NULL)
1450 if (pgio->pg_dreq == NULL) 1717 rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1451 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 1718 else
1452 else 1719 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1453 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 1720
1454 1721 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1455 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1722 req->wb_context,
1456 req->wb_context, 1723 req_offset(req),
1457 req_offset(req), 1724 rd_size,
1458 rd_size, 1725 IOMODE_READ,
1459 IOMODE_READ, 1726 GFP_KERNEL);
1460 GFP_KERNEL); 1727 }
1461 /* If no lseg, fall back to read through mds */ 1728 /* If no lseg, fall back to read through mds */
1462 if (pgio->pg_lseg == NULL) 1729 if (pgio->pg_lseg == NULL)
1463 nfs_pageio_reset_read_mds(pgio); 1730 nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
1469pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1736pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1470 struct nfs_page *req, u64 wb_size) 1737 struct nfs_page *req, u64 wb_size)
1471{ 1738{
1472 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1739 if (pgio->pg_lseg == NULL)
1473 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1474 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context,
1475 req->wb_context, 1742 req_offset(req),
1476 req_offset(req), 1743 wb_size,
1477 wb_size, 1744 IOMODE_RW,
1478 IOMODE_RW, 1745 GFP_NOFS);
1479 GFP_NOFS);
1480 /* If no lseg, fall back to write through mds */ 1746 /* If no lseg, fall back to write through mds */
1481 if (pgio->pg_lseg == NULL) 1747 if (pgio->pg_lseg == NULL)
1482 nfs_pageio_reset_write_mds(pgio); 1748 nfs_pageio_reset_write_mds(pgio);
1483} 1749}
1484EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1750EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1485 1751
1752void
1753pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1754{
1755 if (desc->pg_lseg) {
1756 pnfs_put_lseg(desc->pg_lseg);
1757 desc->pg_lseg = NULL;
1758 }
1759}
1760EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1761
1486/* 1762/*
1487 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1488 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1489 */ 1765 */
1490size_t 1766size_t
1491pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1767pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1492 struct nfs_page *req) 1768 struct nfs_page *prev, struct nfs_page *req)
1493{ 1769{
1494 unsigned int size; 1770 unsigned int size;
1495 u64 seg_end, req_start, seg_left; 1771 u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1513 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1514 pgio->pg_lseg->pls_range.length); 1790 pgio->pg_lseg->pls_range.length);
1515 req_start = req_offset(req); 1791 req_start = req_offset(req);
1516 WARN_ON_ONCE(req_start > seg_end); 1792 WARN_ON_ONCE(req_start >= seg_end);
1517 /* start of request is past the last byte of this segment */ 1793 /* start of request is past the last byte of this segment */
1518 if (req_start >= seg_end) 1794 if (req_start >= seg_end) {
1795 /* reference the new lseg */
1796 if (pgio->pg_ops->pg_cleanup)
1797 pgio->pg_ops->pg_cleanup(pgio);
1798 if (pgio->pg_ops->pg_init)
1799 pgio->pg_ops->pg_init(pgio, req);
1519 return 0; 1800 return 0;
1801 }
1520 1802
1521 /* adjust 'size' iff there are fewer bytes left in the 1803 /* adjust 'size' iff there are fewer bytes left in the
1522 * segment than what nfs_generic_pg_test returned */ 1804 * segment than what nfs_generic_pg_test returned */
@@ -1571,10 +1853,12 @@ static void
1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1853pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1572 struct nfs_pgio_header *hdr) 1854 struct nfs_pgio_header *hdr)
1573{ 1855{
1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1857
1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1575 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1859 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1576 nfs_pageio_reset_write_mds(desc); 1860 nfs_pageio_reset_write_mds(desc);
1577 desc->pg_recoalesce = 1; 1861 mirror->pg_recoalesce = 1;
1578 } 1862 }
1579 nfs_pgio_data_destroy(hdr); 1863 nfs_pgio_data_destroy(hdr);
1580} 1864}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
1608 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1609 enum pnfs_try_status trypnfs; 1893 enum pnfs_try_status trypnfs;
1610 1894
1611 desc->pg_lseg = NULL;
1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1613 if (trypnfs == PNFS_NOT_ATTEMPTED) 1896 if (trypnfs == PNFS_NOT_ATTEMPTED)
1614 pnfs_write_through_mds(desc, hdr); 1897 pnfs_write_through_mds(desc, hdr);
1615 pnfs_put_lseg(lseg);
1616} 1898}
1617 1899
1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1900static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1625int 1907int
1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1908pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1627{ 1909{
1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1911
1628 struct nfs_pgio_header *hdr; 1912 struct nfs_pgio_header *hdr;
1629 int ret; 1913 int ret;
1630 1914
1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1632 if (!hdr) { 1916 if (!hdr) {
1633 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1917 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1634 pnfs_put_lseg(desc->pg_lseg);
1635 desc->pg_lseg = NULL;
1636 return -ENOMEM; 1918 return -ENOMEM;
1637 } 1919 }
1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1921
1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1640 ret = nfs_generic_pgio(desc, hdr); 1923 ret = nfs_generic_pgio(desc, hdr);
1641 if (ret != 0) { 1924 if (!ret)
1642 pnfs_put_lseg(desc->pg_lseg);
1643 desc->pg_lseg = NULL;
1644 } else
1645 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1926
1646 return ret; 1927 return ret;
1647} 1928}
1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1929EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
1687pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1968pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1688 struct nfs_pgio_header *hdr) 1969 struct nfs_pgio_header *hdr)
1689{ 1970{
1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1972
1690 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1691 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1974 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1692 nfs_pageio_reset_read_mds(desc); 1975 nfs_pageio_reset_read_mds(desc);
1693 desc->pg_recoalesce = 1; 1976 mirror->pg_recoalesce = 1;
1694 } 1977 }
1695 nfs_pgio_data_destroy(hdr); 1978 nfs_pgio_data_destroy(hdr);
1696} 1979}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1719 return trypnfs; 2002 return trypnfs;
1720} 2003}
1721 2004
2005/* Resend all requests through pnfs. */
2006int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2007{
2008 struct nfs_pageio_descriptor pgio;
2009
2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2011 return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2014
1722static void 2015static void
1723pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2016pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1724{ 2017{
1725 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1726 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1727 enum pnfs_try_status trypnfs; 2020 enum pnfs_try_status trypnfs;
2021 int err = 0;
1728 2022
1729 desc->pg_lseg = NULL;
1730 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1731 if (trypnfs == PNFS_NOT_ATTEMPTED) 2024 if (trypnfs == PNFS_TRY_AGAIN)
2025 err = pnfs_read_resend_pnfs(hdr);
2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err)
1732 pnfs_read_through_mds(desc, hdr); 2027 pnfs_read_through_mds(desc, hdr);
1733 pnfs_put_lseg(lseg);
1734} 2028}
1735 2029
1736static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2030static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1743int 2037int
1744pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2038pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1745{ 2039{
2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2041
1746 struct nfs_pgio_header *hdr; 2042 struct nfs_pgio_header *hdr;
1747 int ret; 2043 int ret;
1748 2044
1749 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1750 if (!hdr) { 2046 if (!hdr) {
1751 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 2047 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1752 ret = -ENOMEM; 2048 return -ENOMEM;
1753 pnfs_put_lseg(desc->pg_lseg);
1754 desc->pg_lseg = NULL;
1755 return ret;
1756 } 2049 }
1757 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1758 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1759 ret = nfs_generic_pgio(desc, hdr); 2052 ret = nfs_generic_pgio(desc, hdr);
1760 if (ret != 0) { 2053 if (!ret)
1761 pnfs_put_lseg(desc->pg_lseg);
1762 desc->pg_lseg = NULL;
1763 } else
1764 pnfs_do_read(desc, hdr); 2054 pnfs_do_read(desc, hdr);
1765 return ret; 2055 return ret;
1766} 2056}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
1966 pnfs_clear_layoutcommitting(inode); 2256 pnfs_clear_layoutcommitting(inode);
1967 goto out; 2257 goto out;
1968} 2258}
2259EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
1969 2260
1970struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2261struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1971{ 2262{