aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/exofs/Kconfig2
-rw-r--r--fs/nfs/callback_xdr.c12
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/nfs4filelayout.c7
-rw-r--r--fs/nfs/nfs4proc.c6
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c872
-rw-r--r--fs/nfs/objlayout/objlayout.c209
-rw-r--r--fs/nfs/objlayout/objlayout.h48
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c25
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--include/linux/nfs_fs.h1
-rw-r--r--include/linux/sunrpc/clnt.h2
-rw-r--r--include/linux/sunrpc/svc.h1
-rw-r--r--net/sunrpc/auth_unix.c3
-rw-r--r--net/sunrpc/rpcb_clnt.c88
-rw-r--r--net/sunrpc/sunrpc_syms.c3
-rw-r--r--net/sunrpc/svc.c48
20 files changed, 457 insertions, 888 deletions
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index fa9a286c8771..da42f32c49be 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -5,7 +5,7 @@
5# selected by any of the users. 5# selected by any of the users.
6config ORE 6config ORE
7 tristate 7 tristate
8 depends on EXOFS_FS 8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR 9 select ASYNC_XOR
10 default SCSI_OSD_ULD 10 default SCSI_OSD_ULD
11 11
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afea..726e59a9e50f 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
488 struct xdr_stream *xdr, 488 struct xdr_stream *xdr,
489 struct cb_recallanyargs *args) 489 struct cb_recallanyargs *args)
490{ 490{
491 __be32 *p; 491 uint32_t bitmap[2];
492 __be32 *p, status;
492 493
493 args->craa_addr = svc_addr(rqstp); 494 args->craa_addr = svc_addr(rqstp);
494 p = read_buf(xdr, 4); 495 p = read_buf(xdr, 4);
495 if (unlikely(p == NULL)) 496 if (unlikely(p == NULL))
496 return htonl(NFS4ERR_BADXDR); 497 return htonl(NFS4ERR_BADXDR);
497 args->craa_objs_to_keep = ntohl(*p++); 498 args->craa_objs_to_keep = ntohl(*p++);
498 p = read_buf(xdr, 4); 499 status = decode_bitmap(xdr, bitmap);
499 if (unlikely(p == NULL)) 500 if (unlikely(status))
500 return htonl(NFS4ERR_BADXDR); 501 return status;
501 args->craa_type_mask = ntohl(*p); 502 args->craa_type_mask = bitmap[0];
502 503
503 return 0; 504 return 0;
504} 505}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
986 .vs_proc = nfs4_callback_procedures1, 987 .vs_proc = nfs4_callback_procedures1,
987 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 988 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
988 .vs_dispatch = NULL, 989 .vs_dispatch = NULL,
990 .vs_hidden = 1,
989}; 991};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 91c01f0a4c3b..0a1f8312b4dc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
137static int 137static int
138nfs_file_release(struct inode *inode, struct file *filp) 138nfs_file_release(struct inode *inode, struct file *filp)
139{ 139{
140 struct dentry *dentry = filp->f_path.dentry;
141
142 dprintk("NFS: release(%s/%s)\n", 140 dprintk("NFS: release(%s/%s)\n",
143 dentry->d_parent->d_name.name, 141 filp->f_path.dentry->d_parent->d_name.name,
144 dentry->d_name.name); 142 filp->f_path.dentry->d_name.name);
145 143
146 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 144 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
147 return nfs_release(inode, filp); 145 return nfs_release(inode, filp);
@@ -228,14 +226,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
228 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 226 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
229 struct inode * inode = dentry->d_inode; 227 struct inode * inode = dentry->d_inode;
230 ssize_t result; 228 ssize_t result;
231 size_t count = iov_length(iov, nr_segs);
232 229
233 if (iocb->ki_filp->f_flags & O_DIRECT) 230 if (iocb->ki_filp->f_flags & O_DIRECT)
234 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 231 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
235 232
236 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 233 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
237 dentry->d_parent->d_name.name, dentry->d_name.name, 234 dentry->d_parent->d_name.name, dentry->d_name.name,
238 (unsigned long) count, (unsigned long) pos); 235 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
239 236
240 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 237 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
241 if (!result) { 238 if (!result) {
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 09119418402f..12185aadb349 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -449,9 +449,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
449 449
450 fl->dsaddr = dsaddr; 450 fl->dsaddr = dsaddr;
451 451
452 if (fl->first_stripe_index < 0 || 452 if (fl->first_stripe_index >= dsaddr->stripe_count) {
453 fl->first_stripe_index >= dsaddr->stripe_count) { 453 dprintk("%s Bad first_stripe_index %u\n",
454 dprintk("%s Bad first_stripe_index %d\n",
455 __func__, fl->first_stripe_index); 454 __func__, fl->first_stripe_index);
456 goto out_put; 455 goto out_put;
457 } 456 }
@@ -552,7 +551,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
552 551
553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 552 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
554 * Futher checking is done in filelayout_check_layout */ 553 * Futher checking is done in filelayout_check_layout */
555 if (fl->num_fh < 0 || fl->num_fh > 554 if (fl->num_fh >
556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) 555 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
557 goto out_err; 556 goto out_err;
558 557
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2ae413c986a..b60fddf606f7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5950,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata)
5950{ 5950{
5951 struct nfs4_layoutcommit_data *data = calldata; 5951 struct nfs4_layoutcommit_data *data = calldata;
5952 struct pnfs_layout_segment *lseg, *tmp; 5952 struct pnfs_layout_segment *lseg, *tmp;
5953 unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
5953 5954
5954 pnfs_cleanup_layoutcommit(data); 5955 pnfs_cleanup_layoutcommit(data);
5955 /* Matched by references in pnfs_set_layoutcommit */ 5956 /* Matched by references in pnfs_set_layoutcommit */
@@ -5959,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata)
5959 &lseg->pls_flags)) 5960 &lseg->pls_flags))
5960 put_lseg(lseg); 5961 put_lseg(lseg);
5961 } 5962 }
5963
5964 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
5965 smp_mb__after_clear_bit();
5966 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
5967
5962 put_rpccred(data->cred); 5968 put_rpccred(data->cred);
5963 kfree(data); 5969 kfree(data);
5964} 5970}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4f..e6161b213ed1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6602 if (status) 6602 if (status)
6603 goto out; 6603 goto out;
6604 status = decode_secinfo(xdr, res); 6604 status = decode_secinfo(xdr, res);
6605 if (status)
6606 goto out;
6607out: 6605out:
6608 return status; 6606 return status;
6609} 6607}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc3..c807ab93140e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
38 */ 38 */
39 39
40#include <linux/module.h> 40#include <linux/module.h>
41#include <scsi/osd_initiator.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44 44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent { 47struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node; 48 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od; 49 struct ore_dev od;
56}; 50};
57 51
58static void 52static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{ 54{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); 55 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62 56
63 dprintk("%s: free od=%p\n", __func__, de->od); 57 dprintk("%s: free od=%p\n", __func__, de->od.od);
64 osduld_put_device(de->od); 58 osduld_put_device(de->od.od);
65 kfree(de); 59 kfree(de);
66} 60}
67 61
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
98 nfss->pnfs_curr_ld, 92 nfss->pnfs_curr_ld,
99 nfss->nfs_client, 93 nfss->nfs_client,
100 d_id); 94 d_id);
101 de->od = od; 95 de->od.od = od;
102 96
103 d = nfs4_insert_deviceid_node(&de->id_node); 97 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node); 98 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) { 99 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od); 100 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
107 objio_free_deviceid_node(&de->id_node); 101 objio_free_deviceid_node(&de->id_node);
108 de = n; 102 de = n;
109 } 103 }
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
111 return de; 105 return de;
112} 106}
113 107
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment { 108struct objio_segment {
120 struct pnfs_layout_segment lseg; 109 struct pnfs_layout_segment lseg;
121 110
122 struct pnfs_osd_object_cred *comps; 111 struct ore_layout layout;
123 112 struct ore_components oc;
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136}; 113};
137 114
138static inline struct objio_segment * 115static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141 return container_of(lseg, struct objio_segment, lseg); 118 return container_of(lseg, struct objio_segment, lseg);
142} 119}
143 120
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state { 121struct objio_state {
148 /* Generic layer */ 122 /* Generic layer */
149 struct objlayout_io_state ol_state; 123 struct objlayout_io_res oir;
150 124
151 struct objio_segment *layout; 125 bool sync;
152 126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
153 struct kref kref; 127 struct ore_io_state *ios;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167}; 128};
168 129
169/* Send and wait for a get_device_info of devices in the layout, 130/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */ 131 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, 132static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp, 133 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
173 gfp_t gfp_flags) 134 gfp_t gfp_flags)
174{ 135{
175 struct pnfs_osd_deviceaddr *deviceaddr; 136 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
178 struct osd_dev *od; 138 struct osd_dev *od;
179 struct osd_dev_info odi; 139 struct osd_dev_info odi;
180 int err; 140 int err;
181 141
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode) 143 if (ode) {
186 return ode; 144 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
145 return 0;
146 }
187 147
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 148 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) { 149 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", 150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); 151 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err); 152 return err;
193 } 153 }
194 154
195 odi.systemid_len = deviceaddr->oda_systemid.len; 155 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) { 156 if (odi.systemid_len > sizeof(odi.systemid)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__, sizeof(odi.systemid));
197 err = -EINVAL; 159 err = -EINVAL;
198 goto out; 160 goto out;
199 } else if (odi.systemid_len) 161 } else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
218 180
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, 181 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags); 182 gfp_flags);
221 183 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id), _DEVID_HI(d_id));
222out: 186out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr); 187 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode; 188 return err;
226} 189}
227 190
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 191static void copy_single_comp(struct ore_components *oc, unsigned c,
229 struct objio_segment *objio_seg, 192 struct pnfs_osd_object_cred *src_comp)
230 gfp_t gfp_flags)
231{ 193{
232 unsigned i; 194 struct ore_comp *ocomp = &oc->comps[c];
233 int err;
234 195
235 /* lookup all devices */ 196 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
236 for (i = 0; i < objio_seg->num_comps; i++) { 197 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
237 struct objio_dev_ent *ode;
238 198
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); 199 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
240 if (unlikely(IS_ERR(ode))) { 200 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247 201
248out: 202 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251} 203}
252 204
253static int _verify_data_map(struct pnfs_osd_layout *layout) 205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg)
254{ 207{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map; 208 struct __alloc_objio_segment {
256 u64 stripe_length; 209 struct objio_segment olseg;
257 u32 group_width; 210 struct ore_dev *ods[numdevs];
258 211 struct ore_comp comps[numdevs];
259/* FIXME: Only raid0 for now. if not go through MDS */ 212 } *aolseg;
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269 213
270 if (data_map->odm_group_width) 214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
271 group_width = data_map->odm_group_width; 215 if (unlikely(!aolseg)) {
272 else 216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
273 group_width = data_map->odm_num_comps / 217 numdevs, sizeof(*aolseg));
274 (data_map->odm_mirror_cnt + 1); 218 return -ENOMEM;
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 } 219 }
282 220
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { 221 aolseg->olseg.oc.numdevs = numdevs;
284 printk(KERN_ERR "Stripe Unit(0x%llx)" 222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
285 " must be Multples of PAGE_SIZE(0x%lx)\n", 223 aolseg->olseg.oc.comps = aolseg->comps;
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE); 224 aolseg->olseg.oc.ods = aolseg->ods;
287 return -ENOTSUPP;
288 }
289 225
226 *pseg = &aolseg->olseg;
290 return 0; 227 return 0;
291} 228}
292 229
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp, 230int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay, 231 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range, 232 struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
317 struct objio_segment *objio_seg; 236 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter; 237 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout; 238 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp; 239 struct pnfs_osd_object_cred src_comp;
321 struct caps_buffers *caps_p; 240 unsigned cur_comp;
322 int err; 241 int err;
323 242
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); 243 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err)) 244 if (unlikely(err))
326 return err; 245 return err;
327 246
328 err = _verify_data_map(&layout); 247 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
329 if (unlikely(err)) 248 if (unlikely(err))
330 return err; 249 return err;
331 250
332 objio_seg = kzalloc(sizeof(*objio_seg) + 251 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps + 252 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
334 sizeof(*objio_seg->comps) * layout.olo_num_comps + 253 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
335 sizeof(struct caps_buffers) * layout.olo_num_comps, 254 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
336 gfp_flags); 255 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
337 if (!objio_seg)
338 return -ENOMEM;
339 256
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); 257 err = ore_verify_layout(layout.olo_map.odm_num_comps,
341 cur_comp = objio_seg->comps; 258 &objio_seg->layout);
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err)) 259 if (unlikely(err))
346 goto err; 260 goto err;
347 261
348 objio_seg->num_comps = layout.olo_num_comps; 262 objio_seg->oc.first_dev = layout.olo_comps_index;
349 objio_seg->comps_index = layout.olo_comps_index; 263 cur_comp = 0;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); 264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
351 if (err) 265 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
352 goto err; 266 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
353 267 &src_comp.oc_object_id.oid_device_id,
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; 268 gfp_flags);
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; 269 if (err)
356 if (layout.olo_map.odm_group_width) { 270 goto err;
357 objio_seg->group_width = layout.olo_map.odm_group_width; 271 ++cur_comp;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 } 272 }
368 273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
369 /* Cache this calculation it will hit for every page */ 274 if (unlikely(err))
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - 275 goto err;
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373 276
374 *outp = &objio_seg->lseg; 277 *outp = &objio_seg->lseg;
375 return 0; 278 return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
386 int i; 289 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 290 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388 291
389 for (i = 0; i < objio_seg->num_comps; i++) { 292 for (i = 0; i < objio_seg->oc.numdevs; i++) {
390 if (!objio_seg->ods[i]) 293 struct ore_dev *od = objio_seg->oc.ods[i];
294 struct objio_dev_ent *ode;
295
296 if (!od)
391 break; 297 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); 298 ode = container_of(od, typeof(*ode), od);
299 nfs4_put_deviceid_node(&ode->id_node);
393 } 300 }
394 kfree(objio_seg); 301 kfree(objio_seg);
395} 302}
396 303
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg, 304static int
398 struct objlayout_io_state **outp, 305objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
399 gfp_t gfp_flags) 306 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
307 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
308 struct objio_state **outp)
400{ 309{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 310 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios; 311 struct ore_io_state *ios;
403 const unsigned first_size = sizeof(*ios) + 312 int ret;
404 objio_seg->num_comps * sizeof(ios->per_dev[0]); 313 struct __alloc_objio_state {
405 const unsigned sec_size = objio_seg->num_comps * 314 struct objio_state objios;
406 sizeof(ios->ol_state.ioerrs[0]); 315 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
407 316 } *aos;
408 ios = kzalloc(first_size + sec_size, gfp_flags); 317
409 if (unlikely(!ios)) 318 aos = kzalloc(sizeof(*aos), gfp_flags);
319 if (unlikely(!aos))
410 return -ENOMEM; 320 return -ENOMEM;
411 321
412 ios->layout = objio_seg; 322 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
413 ios->ol_state.ioerrs = ((void *)ios) + first_size; 323 aos->ioerrs, rpcdata, pnfs_layout_type);
414 ios->ol_state.num_comps = objio_seg->num_comps;
415 324
416 *outp = &ios->ol_state; 325 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
326 offset, count, &ios);
327 if (unlikely(ret)) {
328 kfree(aos);
329 return ret;
330 }
331
332 ios->pages = pages;
333 ios->pgbase = pgbase;
334 ios->private = aos;
335 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
336
337 aos->objios.sync = 0;
338 aos->objios.ios = ios;
339 *outp = &aos->objios;
417 return 0; 340 return 0;
418} 341}
419 342
420void objio_free_io_state(struct objlayout_io_state *ol_state) 343void objio_free_result(struct objlayout_io_res *oir)
421{ 344{
422 struct objio_state *ios = container_of(ol_state, struct objio_state, 345 struct objio_state *objios = container_of(oir, struct objio_state, oir);
423 ol_state);
424 346
425 kfree(ios); 347 ore_put_io_state(objios->ios);
348 kfree(objios);
426} 349}
427 350
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) 351enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
455 } 378 }
456} 379}
457 380
458static void _clear_bio(struct bio *bio) 381static void __on_dev_error(struct ore_io_state *ios,
382 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
383 u64 dev_offset, u64 dev_len)
459{ 384{
460 struct bio_vec *bv; 385 struct objio_state *objios = ios->private;
461 unsigned i; 386 struct pnfs_osd_objid pooid;
462 387 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
463 __bio_for_each_segment(bv, bio, i, 0) { 388 /* FIXME: what to do with more-then-one-group layouts. We need to
464 unsigned this_count = bv->bv_len; 389 * translate from ore_io_state index to oc->comps index
465 390 */
466 if (likely(PAGE_SIZE == this_count)) 391 unsigned comp = dev_index;
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 int ret;
483
484 if (!or)
485 continue;
486 392
487 ret = osd_req_decode_sense(or, &osi); 393 pooid.oid_device_id = ode->id_node.deviceid;
488 if (likely(!ret)) 394 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
489 continue; 395 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
490 396
491 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 397 objlayout_io_set_result(&objios->oir, comp,
492 /* start read offset passed endof file */ 398 &pooid, osd_pri_2_pnfs_err(oep),
493 BUG_ON(is_write); 399 dev_offset, dev_len, !ios->reading);
494 _clear_bio(ios->per_dev[i].bio);
495 dprintk("%s: start read offset passed end of file "
496 "offset=0x%llx, length=0x%lx\n", __func__,
497 _LLU(ios->per_dev[i].offset),
498 ios->per_dev[i].length);
499
500 continue; /* we recovered */
501 }
502 objlayout_io_set_result(&ios->ol_state, i,
503 &ios->layout->comps[i].oc_object_id,
504 osd_pri_2_pnfs_err(osi.osd_err_pri),
505 ios->per_dev[i].offset,
506 ios->per_dev[i].length,
507 is_write);
508
509 if (osi.osd_err_pri >= oep) {
510 oep = osi.osd_err_pri;
511 lin_ret = ret;
512 }
513 }
514
515 return lin_ret;
516}
517
518/*
519 * Common IO state helpers.
520 */
521static void _io_free(struct objio_state *ios)
522{
523 unsigned i;
524
525 for (i = 0; i < ios->numdevs; i++) {
526 struct _objio_per_comp *per_dev = &ios->per_dev[i];
527
528 if (per_dev->or) {
529 osd_end_request(per_dev->or);
530 per_dev->or = NULL;
531 }
532
533 if (per_dev->bio) {
534 bio_put(per_dev->bio);
535 per_dev->bio = NULL;
536 }
537 }
538}
539
540struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
541{
542 unsigned min_dev = ios->layout->comps_index;
543 unsigned max_dev = min_dev + ios->layout->num_comps;
544
545 BUG_ON(dev < min_dev || max_dev <= dev);
546 return ios->layout->ods[dev - min_dev]->od;
547}
548
549struct _striping_info {
550 u64 obj_offset;
551 u64 group_length;
552 unsigned dev;
553 unsigned unit_off;
554};
555
556static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
557 struct _striping_info *si)
558{
559 u32 stripe_unit = ios->layout->stripe_unit;
560 u32 group_width = ios->layout->group_width;
561 u64 group_depth = ios->layout->group_depth;
562 u32 U = stripe_unit * group_width;
563
564 u64 T = U * group_depth;
565 u64 S = T * ios->layout->group_count;
566 u64 M = div64_u64(file_offset, S);
567
568 /*
569 G = (L - (M * S)) / T
570 H = (L - (M * S)) % T
571 */
572 u64 LmodU = file_offset - M * S;
573 u32 G = div64_u64(LmodU, T);
574 u64 H = LmodU - G * T;
575
576 u32 N = div_u64(H, U);
577
578 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
579 si->obj_offset = si->unit_off + (N * stripe_unit) +
580 (M * group_depth * stripe_unit);
581
582 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
583 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
584 si->dev *= ios->layout->mirrors_p1;
585
586 si->group_length = T - H;
587}
588
589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
591 gfp_t gfp_flags)
592{
593 unsigned pg = *cur_pg;
594 int cur_len = len;
595 struct request_queue *q =
596 osd_request_queue(_io_od(ios, per_dev->dev));
597
598 if (per_dev->bio == NULL) {
599 unsigned pages_in_stripe = ios->layout->group_width *
600 (ios->layout->stripe_unit / PAGE_SIZE);
601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
602 ios->layout->group_width;
603
604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
605 bio_size = BIO_MAX_PAGES_KMALLOC;
606
607 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
608 if (unlikely(!per_dev->bio)) {
609 dprintk("Faild to allocate BIO size=%u\n", bio_size);
610 return -ENOMEM;
611 }
612 }
613
614 while (cur_len > 0) {
615 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
616 unsigned added_len;
617
618 BUG_ON(ios->ol_state.nr_pages <= pg);
619 cur_len -= pglen;
620
621 added_len = bio_add_pc_page(q, per_dev->bio,
622 ios->ol_state.pages[pg], pglen, pgbase);
623 if (unlikely(pglen != added_len))
624 return -ENOMEM;
625 pgbase = 0;
626 ++pg;
627 }
628 BUG_ON(cur_len);
629
630 per_dev->length += len;
631 *cur_pg = pg;
632 return 0;
633}
634
635static int _prepare_one_group(struct objio_state *ios, u64 length,
636 struct _striping_info *si, unsigned *last_pg,
637 gfp_t gfp_flags)
638{
639 unsigned stripe_unit = ios->layout->stripe_unit;
640 unsigned mirrors_p1 = ios->layout->mirrors_p1;
641 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
642 unsigned dev = si->dev;
643 unsigned first_dev = dev - (dev % devs_in_group);
644 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
645 unsigned cur_pg = *last_pg;
646 int ret = 0;
647
648 while (length) {
649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
650 unsigned cur_len, page_off = 0;
651
652 if (!per_dev->length) {
653 per_dev->dev = dev;
654 if (dev < si->dev) {
655 per_dev->offset = si->obj_offset + stripe_unit -
656 si->unit_off;
657 cur_len = stripe_unit;
658 } else if (dev == si->dev) {
659 per_dev->offset = si->obj_offset;
660 cur_len = stripe_unit - si->unit_off;
661 page_off = si->unit_off & ~PAGE_MASK;
662 BUG_ON(page_off &&
663 (page_off != ios->ol_state.pgbase));
664 } else { /* dev > si->dev */
665 per_dev->offset = si->obj_offset - si->unit_off;
666 cur_len = stripe_unit;
667 }
668
669 if (max_comp < dev - first_dev)
670 max_comp = dev - first_dev;
671 } else {
672 cur_len = stripe_unit;
673 }
674 if (cur_len >= length)
675 cur_len = length;
676
677 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
678 cur_len, gfp_flags);
679 if (unlikely(ret))
680 goto out;
681
682 dev += mirrors_p1;
683 dev = (dev % devs_in_group) + first_dev;
684
685 length -= cur_len;
686 ios->length += cur_len;
687 }
688out:
689 ios->numdevs = max_comp + mirrors_p1;
690 *last_pg = cur_pg;
691 return ret;
692}
693
694static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
695{
696 u64 length = ios->ol_state.count;
697 u64 offset = ios->ol_state.offset;
698 struct _striping_info si;
699 unsigned last_pg = 0;
700 int ret = 0;
701
702 while (length) {
703 _calc_stripe_info(ios, offset, &si);
704
705 if (length < si.group_length)
706 si.group_length = length;
707
708 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
709 if (unlikely(ret))
710 goto out;
711
712 offset += si.group_length;
713 length -= si.group_length;
714 }
715
716out:
717 if (!ios->length)
718 return ret;
719
720 return 0;
721}
722
723static ssize_t _sync_done(struct objio_state *ios)
724{
725 struct completion *waiting = ios->private;
726
727 complete(waiting);
728 return 0;
729}
730
731static void _last_io(struct kref *kref)
732{
733 struct objio_state *ios = container_of(kref, struct objio_state, kref);
734
735 ios->done(ios);
736}
737
738static void _done_io(struct osd_request *or, void *p)
739{
740 struct objio_state *ios = p;
741
742 kref_put(&ios->kref, _last_io);
743}
744
745static ssize_t _io_exec(struct objio_state *ios)
746{
747 DECLARE_COMPLETION_ONSTACK(wait);
748 ssize_t status = 0; /* sync status */
749 unsigned i;
750 objio_done_fn saved_done_fn = ios->done;
751 bool sync = ios->ol_state.sync;
752
753 if (sync) {
754 ios->done = _sync_done;
755 ios->private = &wait;
756 }
757
758 kref_init(&ios->kref);
759
760 for (i = 0; i < ios->numdevs; i++) {
761 struct osd_request *or = ios->per_dev[i].or;
762
763 if (!or)
764 continue;
765
766 kref_get(&ios->kref);
767 osd_execute_request_async(or, _done_io, ios);
768 }
769
770 kref_put(&ios->kref, _last_io);
771
772 if (sync) {
773 wait_for_completion(&wait);
774 status = saved_done_fn(ios);
775 }
776
777 return status;
778} 400}
779 401
780/* 402/*
781 * read 403 * read
782 */ 404 */
783static ssize_t _read_done(struct objio_state *ios) 405static void _read_done(struct ore_io_state *ios, void *private)
784{ 406{
407 struct objio_state *objios = private;
785 ssize_t status; 408 ssize_t status;
786 int ret = _io_check(ios, false); 409 int ret = ore_check_io(ios, &__on_dev_error);
787 410
788 _io_free(ios); 411 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
789 412
790 if (likely(!ret)) 413 if (likely(!ret))
791 status = ios->length; 414 status = ios->length;
792 else 415 else
793 status = ret; 416 status = ret;
794 417
795 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); 418 objlayout_read_done(&objios->oir, status, objios->sync);
796 return status;
797} 419}
798 420
799static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) 421int objio_read_pagelist(struct nfs_read_data *rdata)
800{ 422{
801 struct osd_request *or = NULL; 423 struct objio_state *objios;
802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
803 unsigned dev = per_dev->dev;
804 struct pnfs_osd_object_cred *cred =
805 &ios->layout->comps[cur_comp];
806 struct osd_obj_id obj = {
807 .partition = cred->oc_object_id.oid_partition_id,
808 .id = cred->oc_object_id.oid_object_id,
809 };
810 int ret; 424 int ret;
811 425
812 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); 426 ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
813 if (unlikely(!or)) { 427 rdata->lseg, rdata->args.pages, rdata->args.pgbase,
814 ret = -ENOMEM; 428 rdata->args.offset, rdata->args.count, rdata,
815 goto err; 429 GFP_KERNEL, &objios);
816 }
817 per_dev->or = or;
818
819 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
820
821 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
822 if (ret) {
823 dprintk("%s: Faild to osd_finalize_request() => %d\n",
824 __func__, ret);
825 goto err;
826 }
827
828 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
829 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
830 per_dev->length);
831
832err:
833 return ret;
834}
835
836static ssize_t _read_exec(struct objio_state *ios)
837{
838 unsigned i;
839 int ret;
840
841 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
842 if (!ios->per_dev[i].length)
843 continue;
844 ret = _read_mirrors(ios, i);
845 if (unlikely(ret))
846 goto err;
847 }
848
849 ios->done = _read_done;
850 return _io_exec(ios); /* In sync mode exec returns the io status */
851
852err:
853 _io_free(ios);
854 return ret;
855}
856
857ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
858{
859 struct objio_state *ios = container_of(ol_state, struct objio_state,
860 ol_state);
861 int ret;
862
863 ret = _io_rw_pagelist(ios, GFP_KERNEL);
864 if (unlikely(ret)) 430 if (unlikely(ret))
865 return ret; 431 return ret;
866 432
867 return _read_exec(ios); 433 objios->ios->done = _read_done;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
435 rdata->args.offset, rdata->args.count);
436 return ore_read(objios->ios);
868} 437}
869 438
870/* 439/*
871 * write 440 * write
872 */ 441 */
873static ssize_t _write_done(struct objio_state *ios) 442static void _write_done(struct ore_io_state *ios, void *private)
874{ 443{
444 struct objio_state *objios = private;
875 ssize_t status; 445 ssize_t status;
876 int ret = _io_check(ios, true); 446 int ret = ore_check_io(ios, &__on_dev_error);
877 447
878 _io_free(ios); 448 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
879 449
880 if (likely(!ret)) { 450 if (likely(!ret)) {
881 /* FIXME: should be based on the OSD's persistence model 451 /* FIXME: should be based on the OSD's persistence model
882 * See OSD2r05 Section 4.13 Data persistence model */ 452 * See OSD2r05 Section 4.13 Data persistence model */
883 ios->ol_state.committed = NFS_FILE_SYNC; 453 objios->oir.committed = NFS_FILE_SYNC;
884 status = ios->length; 454 status = ios->length;
885 } else { 455 } else {
886 status = ret; 456 status = ret;
887 } 457 }
888 458
889 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); 459 objlayout_write_done(&objios->oir, status, objios->sync);
890 return status;
891} 460}
892 461
893static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) 462static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
894{ 463{
895 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; 464 struct objio_state *objios = priv;
896 unsigned dev = ios->per_dev[cur_comp].dev; 465 struct nfs_write_data *wdata = objios->oir.rpcdata;
897 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 466 pgoff_t index = offset / PAGE_SIZE;
898 int ret; 467 struct page *page = find_get_page(wdata->inode->i_mapping, index);
899
900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
901 struct osd_request *or = NULL;
902 struct pnfs_osd_object_cred *cred =
903 &ios->layout->comps[cur_comp];
904 struct osd_obj_id obj = {
905 .partition = cred->oc_object_id.oid_partition_id,
906 .id = cred->oc_object_id.oid_object_id,
907 };
908 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
909 struct bio *bio;
910
911 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
912 if (unlikely(!or)) {
913 ret = -ENOMEM;
914 goto err;
915 }
916 per_dev->or = or;
917
918 if (per_dev != master_dev) {
919 bio = bio_kmalloc(GFP_NOFS,
920 master_dev->bio->bi_max_vecs);
921 if (unlikely(!bio)) {
922 dprintk("Faild to allocate BIO size=%u\n",
923 master_dev->bio->bi_max_vecs);
924 ret = -ENOMEM;
925 goto err;
926 }
927
928 __bio_clone(bio, master_dev->bio);
929 bio->bi_bdev = NULL;
930 bio->bi_next = NULL;
931 per_dev->bio = bio;
932 per_dev->dev = dev;
933 per_dev->length = master_dev->length;
934 per_dev->offset = master_dev->offset;
935 } else {
936 bio = master_dev->bio;
937 bio->bi_rw |= REQ_WRITE;
938 }
939
940 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
941 468
942 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); 469 if (!page) {
943 if (ret) { 470 page = find_or_create_page(wdata->inode->i_mapping,
944 dprintk("%s: Faild to osd_finalize_request() => %d\n", 471 index, GFP_NOFS);
945 __func__, ret); 472 if (unlikely(!page)) {
946 goto err; 473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
947 } 476 }
948 477 unlock_page(page);
949 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
950 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
951 per_dev->length);
952 } 478 }
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
485}
953 486
954err: 487static void __r4w_put_page(void *priv, struct page *page)
955 return ret; 488{
489 dprintk("%s: index=0x%lx\n", __func__, page->index);
490 page_cache_release(page);
491 return;
956} 492}
957 493
958static ssize_t _write_exec(struct objio_state *ios) 494static const struct _ore_r4w_op _r4w_op = {
495 .get_page = &__r4w_get_page,
496 .put_page = &__r4w_put_page,
497};
498
499int objio_write_pagelist(struct nfs_write_data *wdata, int how)
959{ 500{
960 unsigned i; 501 struct objio_state *objios;
961 int ret; 502 int ret;
962 503
963 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 504 ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
964 if (!ios->per_dev[i].length) 505 wdata->lseg, wdata->args.pages, wdata->args.pgbase,
965 continue; 506 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
966 ret = _write_mirrors(ios, i); 507 &objios);
967 if (unlikely(ret)) 508 if (unlikely(ret))
968 goto err; 509 return ret;
969 }
970
971 ios->done = _write_done;
972 return _io_exec(ios); /* In sync mode exec returns the io->status */
973 510
974err: 511 objios->sync = 0 != (how & FLUSH_SYNC);
975 _io_free(ios); 512 objios->ios->r4w = &_r4w_op;
976 return ret;
977}
978 513
979ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) 514 if (!objios->sync)
980{ 515 objios->ios->done = _write_done;
981 struct objio_state *ios = container_of(ol_state, struct objio_state,
982 ol_state);
983 int ret;
984 516
985 /* TODO: ios->stable = stable; */ 517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
986 ret = _io_rw_pagelist(ios, GFP_NOFS); 518 wdata->args.offset, wdata->args.count);
519 ret = ore_write(objios->ios);
987 if (unlikely(ret)) 520 if (unlikely(ret))
988 return ret; 521 return ret;
989 522
990 return _write_exec(ios); 523 if (objios->sync)
524 _write_done(objios->ios, objios);
525
526 return 0;
991} 527}
992 528
993static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, 529static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
997 return false; 533 return false;
998 534
999 return pgio->pg_count + req->wb_bytes <= 535 return pgio->pg_count + req->wb_bytes <=
1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 536 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
1001} 537}
1002 538
1003static const struct nfs_pageio_ops objio_pg_read_ops = { 539static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2adea..72074e3a04f9 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 157}
158 158
159static struct objlayout_io_state * 159void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 160 struct page ***p_pages, unsigned *p_pgbase,
161 struct page **pages, 161 u64 offset, unsigned long count)
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{ 162{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset; 163 u64 lseg_end_offset;
171 164
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset); 165 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset, 166 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length); 167 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset); 168 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) { 169 WARN_ON(offset + count > lseg_end_offset);
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185 170
186 if (pgbase > PAGE_SIZE) { 171 if (*p_pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT; 172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
188 pgbase &= ~PAGE_MASK; 173 *p_pages += *p_pgbase >> PAGE_SHIFT;
174 *p_pgbase &= ~PAGE_MASK;
189 } 175 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212} 176}
213 177
214/* 178/*
215 * I/O done common code 179 * I/O done common code
216 */ 180 */
217static void 181static void
218objlayout_iodone(struct objlayout_io_state *state) 182objlayout_iodone(struct objlayout_io_res *oir)
219{ 183{
220 dprintk("%s: state %p status\n", __func__, state); 184 if (likely(oir->status >= 0)) {
221 185 objio_free_result(oir);
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else { 186 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); 187 struct objlayout *objlay = oir->objlay;
226 188
227 spin_lock(&objlay->lock); 189 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID; 190 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list); 191 list_add(&objlay->err_list, &oir->err_list);
230 spin_unlock(&objlay->lock); 192 spin_unlock(&objlay->lock);
231 } 193 }
232} 194}
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
238 * the error for later reporting at layout-return. 200 * the error for later reporting at layout-return.
239 */ 201 */
240void 202void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, 203objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error, 204 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write) 205 u64 offset, u64 length, bool is_write)
244{ 206{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; 207 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
246 208
247 BUG_ON(index >= state->num_comps); 209 BUG_ON(index >= oir->num_comps);
248 if (osd_error) { 210 if (osd_error) {
249 ioerr->oer_component = *pooid; 211 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset; 212 ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
285} 247}
286 248
287void 249void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) 250objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
289{ 251{
290 int eof = state->eof; 252 struct nfs_read_data *rdata = oir->rpcdata;
291 struct nfs_read_data *rdata;
292 253
293 state->status = status; 254 oir->status = rdata->task.tk_status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); 255 if (status >= 0)
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status; 256 rdata->res.count = status;
299 rdata->res.eof = eof; 257 objlayout_iodone(oir);
300 } 258 /* must not use oir after this point */
301 objlayout_iodone(state); 259
302 /* must not use state after this point */ 260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
261 status, rdata->res.eof, sync);
303 262
304 if (sync) 263 if (sync)
305 pnfs_ld_read_done(rdata); 264 pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
317{ 276{
318 loff_t offset = rdata->args.offset; 277 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count; 278 size_t count = rdata->args.count;
320 struct objlayout_io_state *state; 279 int err;
321 ssize_t status = 0;
322 loff_t eof; 280 loff_t eof;
323 281
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode); 282 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) { 283 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) { 284 if (offset >= eof) {
330 status = 0; 285 err = 0;
331 rdata->res.count = 0; 286 rdata->res.count = 0;
332 rdata->res.eof = 1; 287 rdata->res.eof = 1;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
333 goto out; 289 goto out;
334 } 290 }
335 count = eof - offset; 291 count = eof - offset;
336 } 292 }
337 293
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, 294 rdata->res.eof = (offset + count) >= eof;
339 rdata->args.pages, rdata->args.pgbase, 295 _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
340 offset, count, 296 &rdata->args.pgbase,
341 rdata->lseg, rdata, 297 rdata->args.offset, rdata->args.count);
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347 298
348 state->eof = state->offset + state->count >= eof; 299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
349 301
350 status = objio_read_pagelist(state); 302 err = objio_read_pagelist(rdata);
351 out: 303 out:
352 dprintk("%s: Return status %Zd\n", __func__, status); 304 if (unlikely(err)) {
353 rdata->pnfs_error = status; 305 rdata->pnfs_error = err;
306 dprintk("%s: Returned Error %d\n", __func__, err);
307 return PNFS_NOT_ATTEMPTED;
308 }
354 return PNFS_ATTEMPTED; 309 return PNFS_ATTEMPTED;
355} 310}
356 311
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
371} 326}
372 327
373void 328void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status, 329objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
375 bool sync)
376{ 330{
377 struct nfs_write_data *wdata; 331 struct nfs_write_data *wdata = oir->rpcdata;
378 332
379 dprintk("%s: Begin\n", __func__); 333 oir->status = wdata->task.tk_status = status;
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) { 334 if (status >= 0) {
384 wdata->res.count = status; 335 wdata->res.count = status;
385 wdata->verf.committed = state->committed; 336 wdata->verf.committed = oir->committed;
386 dprintk("%s: Return status %d committed %d\n", 337 }
387 __func__, wdata->task.tk_status, 338 objlayout_iodone(oir);
388 wdata->verf.committed); 339 /* must not use oir after this point */
389 } else 340
390 dprintk("%s: Return status %d\n", 341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
391 __func__, wdata->task.tk_status); 342 status, wdata->verf.committed, sync);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394 343
395 if (sync) 344 if (sync)
396 pnfs_ld_write_done(wdata); 345 pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata, 356objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how) 357 int how)
409{ 358{
410 struct objlayout_io_state *state; 359 int err;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427 360
428 state->sync = how & FLUSH_SYNC; 361 _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
362 &wdata->args.pgbase,
363 wdata->args.offset, wdata->args.count);
429 364
430 status = objio_write_pagelist(state, how & FLUSH_STABLE); 365 err = objio_write_pagelist(wdata, how);
431 out: 366 if (unlikely(err)) {
432 dprintk("%s: Return status %Zd\n", __func__, status); 367 wdata->pnfs_error = err;
433 wdata->pnfs_error = status; 368 dprintk("%s: Returned Error %d\n", __func__, err);
369 return PNFS_NOT_ATTEMPTED;
370 }
434 return PNFS_ATTEMPTED; 371 return PNFS_ATTEMPTED;
435} 372}
436 373
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
537static void 474static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p) 475encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{ 476{
540 struct objlayout_io_state *state, *tmp; 477 struct objlayout_io_res *oir, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; 478 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542 479
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 480 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
544 unsigned i; 481 unsigned i;
545 482
546 for (i = 0; i < state->num_comps; i++) { 483 for (i = 0; i < oir->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 484 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
548 485
549 if (!ioerr->oer_errno) 486 if (!ioerr->oer_errno)
550 continue; 487 continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
563 500
564 merge_ioerr(&accumulated_err, ioerr); 501 merge_ioerr(&accumulated_err, ioerr);
565 } 502 }
566 list_del(&state->err_list); 503 list_del(&oir->err_list);
567 objlayout_free_io_state(state); 504 objio_free_result(oir);
568 } 505 }
569 506
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); 507 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
576 const struct nfs4_layoutreturn_args *args) 513 const struct nfs4_layoutreturn_args *args)
577{ 514{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay); 515 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp; 516 struct objlayout_io_res *oir, *tmp;
580 __be32 *start; 517 __be32 *start;
581 518
582 dprintk("%s: Begin\n", __func__); 519 dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
585 522
586 spin_lock(&objlay->lock); 523 spin_lock(&objlay->lock);
587 524
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 525 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p; 526 __be32 *last_xdr = NULL, *p;
590 unsigned i; 527 unsigned i;
591 int res = 0; 528 int res = 0;
592 529
593 for (i = 0; i < state->num_comps; i++) { 530 for (i = 0; i < oir->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 531 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
595 532
596 if (!ioerr->oer_errno) 533 if (!ioerr->oer_errno)
597 continue; 534 continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
615 } 552 }
616 553
617 last_xdr = p; 554 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); 555 pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
619 } 556 }
620 557
621 /* TODO: use xdr_write_pages */ 558 /* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
631 encode_accumulated_error(objlay, last_xdr); 568 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done; 569 goto loop_done;
633 } 570 }
634 list_del(&state->err_list); 571 list_del(&oir->err_list);
635 objlayout_free_io_state(state); 572 objio_free_result(oir);
636 } 573 }
637loop_done: 574loop_done:
638 spin_unlock(&objlay->lock); 575 spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042d..8ec34727ed21 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
74 * per-I/O operation state 74 * per-I/O operation state
75 * embedded in objects provider io_state data structure 75 * embedded in objects provider io_state data structure
76 */ 76 */
77struct objlayout_io_state { 77struct objlayout_io_res {
78 struct pnfs_layout_segment *lseg; 78 struct objlayout *objlay;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86 79
87 void *rpcdata; 80 void *rpcdata;
88 int status; /* res */ 81 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */ 82 int committed; /* res */
91 83
92 /* Error reporting (layout_return) */ 84 /* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
100 struct pnfs_osd_ioerr *ioerrs; 92 struct pnfs_osd_ioerr *ioerrs;
101}; 93};
102 94
95static inline
96void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
97 struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
98 struct pnfs_layout_hdr *pnfs_layout_type)
99{
100 oir->objlay = OBJLAYOUT(pnfs_layout_type);
101 oir->rpcdata = rpcdata;
102 INIT_LIST_HEAD(&oir->err_list);
103 oir->num_comps = num_comps;
104 oir->ioerrs = ioerrs;
105}
106
103/* 107/*
104 * Raid engine I/O API 108 * Raid engine I/O API
105 */ 109 */
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
110 gfp_t gfp_flags); 114 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg); 115extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112 116
113extern int objio_alloc_io_state( 117/* objio_free_result will free these @oir structs recieved from
114 struct pnfs_layout_segment *lseg, 118 * objlayout_{read,write}_done
115 struct objlayout_io_state **outp, 119 */
116 gfp_t gfp_flags); 120extern void objio_free_result(struct objlayout_io_res *oir);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118 121
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); 122extern int objio_read_pagelist(struct nfs_read_data *rdata);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, 123extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
121 bool stable);
122 124
123/* 125/*
124 * callback API 126 * callback API
125 */ 127 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state, 128extern void objlayout_io_set_result(struct objlayout_io_res *oir,
127 unsigned index, struct pnfs_osd_objid *pooid, 129 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write); 130 int osd_error, u64 offset, u64 length, bool is_write);
129 131
130static inline void 132static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) 133objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
132{ 134{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was 135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate 136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported. 137 * the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
144 spin_unlock(&objlay->lock); 144 spin_unlock(&objlay->lock);
145} 145}
146 146
147extern void objlayout_read_done(struct objlayout_io_state *state, 147extern void objlayout_read_done(struct objlayout_io_res *oir,
148 ssize_t status, bool sync); 148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, 152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b60970cc7f1f..0a5ff5c19511 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -41,7 +41,7 @@ nfs_page_free(struct nfs_page *p)
41 41
42/** 42/**
43 * nfs_create_request - Create an NFS read/write request. 43 * nfs_create_request - Create an NFS read/write request.
44 * @file: file descriptor to use 44 * @ctx: open context to use
45 * @inode: inode to which the request is attached 45 * @inode: inode to which the request is attached
46 * @page: page to write 46 * @page: page to write
47 * @offset: starting offset within the page for the write 47 * @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ee73d9a4f700..a2478bc74442 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1443,17 +1443,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1443 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1443 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1444 data = kzalloc(sizeof(*data), GFP_NOFS); 1444 data = kzalloc(sizeof(*data), GFP_NOFS);
1445 if (!data) { 1445 if (!data) {
1446 mark_inode_dirty_sync(inode);
1447 status = -ENOMEM; 1446 status = -ENOMEM;
1448 goto out; 1447 goto out;
1449 } 1448 }
1450 1449
1450 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1451 goto out_free;
1452
1453 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1454 if (!sync) {
1455 status = -EAGAIN;
1456 goto out_free;
1457 }
1458 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1459 nfs_wait_bit_killable, TASK_KILLABLE);
1460 if (status)
1461 goto out_free;
1462 }
1463
1451 INIT_LIST_HEAD(&data->lseg_list); 1464 INIT_LIST_HEAD(&data->lseg_list);
1452 spin_lock(&inode->i_lock); 1465 spin_lock(&inode->i_lock);
1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1466 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1467 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1454 spin_unlock(&inode->i_lock); 1468 spin_unlock(&inode->i_lock);
1455 kfree(data); 1469 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1456 goto out; 1470 goto out_free;
1457 } 1471 }
1458 1472
1459 pnfs_list_write_lseg(inode, &data->lseg_list); 1473 pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1475,6 +1489,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1475 1489
1476 status = nfs4_proc_layoutcommit(data, sync); 1490 status = nfs4_proc_layoutcommit(data, sync);
1477out: 1491out:
1492 if (status)
1493 mark_inode_dirty_sync(inode);
1478 dprintk("<-- %s status %d\n", __func__, status); 1494 dprintk("<-- %s status %d\n", __func__, status);
1479 return status; 1495 return status;
1496out_free:
1497 kfree(data);
1498 goto out;
1480} 1499}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2219c88d96b2..b016b8a36399 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1243,7 +1243,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1243{ 1243{
1244 struct nfs_writeargs *argp = &data->args; 1244 struct nfs_writeargs *argp = &data->args;
1245 struct nfs_writeres *resp = &data->res; 1245 struct nfs_writeres *resp = &data->res;
1246 struct nfs_server *server = NFS_SERVER(data->inode);
1247 int status; 1246 int status;
1248 1247
1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1248 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1277,7 +1276,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1277 if (time_before(complain, jiffies)) { 1276 if (time_before(complain, jiffies)) {
1278 dprintk("NFS: faulty NFS server %s:" 1277 dprintk("NFS: faulty NFS server %s:"
1279 " (committed = %d) != (stable = %d)\n", 1278 " (committed = %d) != (stable = %d)\n",
1280 server->nfs_client->cl_hostname, 1279 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1281 resp->verf->committed, argp->stable); 1280 resp->verf->committed, argp->stable);
1282 complain = jiffies + 300 * HZ; 1281 complain = jiffies + 300 * HZ;
1283 } 1282 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dc5a1bf476b1..52cd976b6099 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -256,6 +256,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
256 nfsd_serv = NULL; 256 nfsd_serv = NULL;
257 nfsd_shutdown(); 257 nfsd_shutdown();
258 258
259 svc_rpcb_cleanup(serv);
260
259 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 261 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
260 "cache\n"); 262 "cache\n");
261 nfsd_export_flush(); 263 nfsd_export_flush();
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 60a137b7f171..ab2c6343361a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -229,6 +229,7 @@ struct nfs_inode {
229#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ 229#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */
230#define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */ 230#define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */
231#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ 231#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
232#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
232 233
233static inline struct nfs_inode *NFS_I(const struct inode *inode) 234static inline struct nfs_inode *NFS_I(const struct inode *inode)
234{ 235{
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 492486a74484..3d8f9c44e27d 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -136,6 +136,8 @@ void rpc_shutdown_client(struct rpc_clnt *);
136void rpc_release_client(struct rpc_clnt *); 136void rpc_release_client(struct rpc_clnt *);
137void rpc_task_release_client(struct rpc_task *); 137void rpc_task_release_client(struct rpc_task *);
138 138
139int rpcb_create_local(void);
140void rpcb_put_local(void);
139int rpcb_register(u32, u32, int, unsigned short); 141int rpcb_register(u32, u32, int, unsigned short);
140int rpcb_v4_register(const u32 program, const u32 version, 142int rpcb_v4_register(const u32 program, const u32 version,
141 const struct sockaddr *address, 143 const struct sockaddr *address,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index d8d5d93071b3..35b37b1e9299 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -413,6 +413,7 @@ struct svc_procedure {
413/* 413/*
414 * Function prototypes. 414 * Function prototypes.
415 */ 415 */
416void svc_rpcb_cleanup(struct svc_serv *serv);
416struct svc_serv *svc_create(struct svc_program *, unsigned int, 417struct svc_serv *svc_create(struct svc_program *, unsigned int,
417 void (*shutdown)(struct svc_serv *)); 418 void (*shutdown)(struct svc_serv *));
418struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, 419struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 4cb70dc6e7ad..e50502d8ceb7 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -129,6 +129,9 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
129 for (i = 0; i < groups ; i++) 129 for (i = 0; i < groups ; i++)
130 if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) 130 if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i))
131 return 0; 131 return 0;
132 if (groups < NFS_NGROUPS &&
133 cred->uc_gids[groups] != NOGROUP)
134 return 0;
132 return 1; 135 return 1;
133} 136}
134 137
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index f588b852d41c..8761bf8e36fc 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -114,6 +114,9 @@ static struct rpc_program rpcb_program;
114static struct rpc_clnt * rpcb_local_clnt; 114static struct rpc_clnt * rpcb_local_clnt;
115static struct rpc_clnt * rpcb_local_clnt4; 115static struct rpc_clnt * rpcb_local_clnt4;
116 116
117DEFINE_SPINLOCK(rpcb_clnt_lock);
118unsigned int rpcb_users;
119
117struct rpcbind_args { 120struct rpcbind_args {
118 struct rpc_xprt * r_xprt; 121 struct rpc_xprt * r_xprt;
119 122
@@ -161,6 +164,56 @@ static void rpcb_map_release(void *data)
161 kfree(map); 164 kfree(map);
162} 165}
163 166
167static int rpcb_get_local(void)
168{
169 int cnt;
170
171 spin_lock(&rpcb_clnt_lock);
172 if (rpcb_users)
173 rpcb_users++;
174 cnt = rpcb_users;
175 spin_unlock(&rpcb_clnt_lock);
176
177 return cnt;
178}
179
180void rpcb_put_local(void)
181{
182 struct rpc_clnt *clnt = rpcb_local_clnt;
183 struct rpc_clnt *clnt4 = rpcb_local_clnt4;
184 int shutdown;
185
186 spin_lock(&rpcb_clnt_lock);
187 if (--rpcb_users == 0) {
188 rpcb_local_clnt = NULL;
189 rpcb_local_clnt4 = NULL;
190 }
191 shutdown = !rpcb_users;
192 spin_unlock(&rpcb_clnt_lock);
193
194 if (shutdown) {
195 /*
196 * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister
197 */
198 if (clnt4)
199 rpc_shutdown_client(clnt4);
200 if (clnt)
201 rpc_shutdown_client(clnt);
202 }
203}
204
205static void rpcb_set_local(struct rpc_clnt *clnt, struct rpc_clnt *clnt4)
206{
207 /* Protected by rpcb_create_local_mutex */
208 rpcb_local_clnt = clnt;
209 rpcb_local_clnt4 = clnt4;
210 smp_wmb();
211 rpcb_users = 1;
212 dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
213 "%p, rpcb_local_clnt4: %p)\n", rpcb_local_clnt,
214 rpcb_local_clnt4);
215}
216
164/* 217/*
165 * Returns zero on success, otherwise a negative errno value 218 * Returns zero on success, otherwise a negative errno value
166 * is returned. 219 * is returned.
@@ -205,9 +258,7 @@ static int rpcb_create_local_unix(void)
205 clnt4 = NULL; 258 clnt4 = NULL;
206 } 259 }
207 260
208 /* Protected by rpcb_create_local_mutex */ 261 rpcb_set_local(clnt, clnt4);
209 rpcb_local_clnt = clnt;
210 rpcb_local_clnt4 = clnt4;
211 262
212out: 263out:
213 return result; 264 return result;
@@ -259,9 +310,7 @@ static int rpcb_create_local_net(void)
259 clnt4 = NULL; 310 clnt4 = NULL;
260 } 311 }
261 312
262 /* Protected by rpcb_create_local_mutex */ 313 rpcb_set_local(clnt, clnt4);
263 rpcb_local_clnt = clnt;
264 rpcb_local_clnt4 = clnt4;
265 314
266out: 315out:
267 return result; 316 return result;
@@ -271,16 +320,16 @@ out:
271 * Returns zero on success, otherwise a negative errno value 320 * Returns zero on success, otherwise a negative errno value
272 * is returned. 321 * is returned.
273 */ 322 */
274static int rpcb_create_local(void) 323int rpcb_create_local(void)
275{ 324{
276 static DEFINE_MUTEX(rpcb_create_local_mutex); 325 static DEFINE_MUTEX(rpcb_create_local_mutex);
277 int result = 0; 326 int result = 0;
278 327
279 if (rpcb_local_clnt) 328 if (rpcb_get_local())
280 return result; 329 return result;
281 330
282 mutex_lock(&rpcb_create_local_mutex); 331 mutex_lock(&rpcb_create_local_mutex);
283 if (rpcb_local_clnt) 332 if (rpcb_get_local())
284 goto out; 333 goto out;
285 334
286 if (rpcb_create_local_unix() != 0) 335 if (rpcb_create_local_unix() != 0)
@@ -382,11 +431,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
382 struct rpc_message msg = { 431 struct rpc_message msg = {
383 .rpc_argp = &map, 432 .rpc_argp = &map,
384 }; 433 };
385 int error;
386
387 error = rpcb_create_local();
388 if (error)
389 return error;
390 434
391 dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " 435 dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
392 "rpcbind\n", (port ? "" : "un"), 436 "rpcbind\n", (port ? "" : "un"),
@@ -522,11 +566,7 @@ int rpcb_v4_register(const u32 program, const u32 version,
522 struct rpc_message msg = { 566 struct rpc_message msg = {
523 .rpc_argp = &map, 567 .rpc_argp = &map,
524 }; 568 };
525 int error;
526 569
527 error = rpcb_create_local();
528 if (error)
529 return error;
530 if (rpcb_local_clnt4 == NULL) 570 if (rpcb_local_clnt4 == NULL)
531 return -EPROTONOSUPPORT; 571 return -EPROTONOSUPPORT;
532 572
@@ -1060,15 +1100,3 @@ static struct rpc_program rpcb_program = {
1060 .version = rpcb_version, 1100 .version = rpcb_version,
1061 .stats = &rpcb_stats, 1101 .stats = &rpcb_stats,
1062}; 1102};
1063
1064/**
1065 * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister
1066 *
1067 */
1068void cleanup_rpcb_clnt(void)
1069{
1070 if (rpcb_local_clnt4)
1071 rpc_shutdown_client(rpcb_local_clnt4);
1072 if (rpcb_local_clnt)
1073 rpc_shutdown_client(rpcb_local_clnt);
1074}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 9d0809160994..8ec9778c3f4a 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -61,8 +61,6 @@ static struct pernet_operations sunrpc_net_ops = {
61 61
62extern struct cache_detail unix_gid_cache; 62extern struct cache_detail unix_gid_cache;
63 63
64extern void cleanup_rpcb_clnt(void);
65
66static int __init 64static int __init
67init_sunrpc(void) 65init_sunrpc(void)
68{ 66{
@@ -102,7 +100,6 @@ out:
102static void __exit 100static void __exit
103cleanup_sunrpc(void) 101cleanup_sunrpc(void)
104{ 102{
105 cleanup_rpcb_clnt();
106 rpcauth_remove_module(); 103 rpcauth_remove_module();
107 cleanup_socket_xprt(); 104 cleanup_socket_xprt();
108 svc_cleanup_xprt_sock(); 105 svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index dd5cc00ed559..6e038884ae0c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -366,6 +366,42 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
366 return &serv->sv_pools[pidx % serv->sv_nrpools]; 366 return &serv->sv_pools[pidx % serv->sv_nrpools];
367} 367}
368 368
369static int svc_rpcb_setup(struct svc_serv *serv)
370{
371 int err;
372
373 err = rpcb_create_local();
374 if (err)
375 return err;
376
377 /* Remove any stale portmap registrations */
378 svc_unregister(serv);
379 return 0;
380}
381
382void svc_rpcb_cleanup(struct svc_serv *serv)
383{
384 svc_unregister(serv);
385 rpcb_put_local();
386}
387EXPORT_SYMBOL_GPL(svc_rpcb_cleanup);
388
389static int svc_uses_rpcbind(struct svc_serv *serv)
390{
391 struct svc_program *progp;
392 unsigned int i;
393
394 for (progp = serv->sv_program; progp; progp = progp->pg_next) {
395 for (i = 0; i < progp->pg_nvers; i++) {
396 if (progp->pg_vers[i] == NULL)
397 continue;
398 if (progp->pg_vers[i]->vs_hidden == 0)
399 return 1;
400 }
401 }
402
403 return 0;
404}
369 405
370/* 406/*
371 * Create an RPC service 407 * Create an RPC service
@@ -431,8 +467,15 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
431 spin_lock_init(&pool->sp_lock); 467 spin_lock_init(&pool->sp_lock);
432 } 468 }
433 469
434 /* Remove any stale portmap registrations */ 470 if (svc_uses_rpcbind(serv)) {
435 svc_unregister(serv); 471 if (svc_rpcb_setup(serv) < 0) {
472 kfree(serv->sv_pools);
473 kfree(serv);
474 return NULL;
475 }
476 if (!serv->sv_shutdown)
477 serv->sv_shutdown = svc_rpcb_cleanup;
478 }
436 479
437 return serv; 480 return serv;
438} 481}
@@ -500,7 +543,6 @@ svc_destroy(struct svc_serv *serv)
500 if (svc_serv_is_pooled(serv)) 543 if (svc_serv_is_pooled(serv))
501 svc_pool_map_put(); 544 svc_pool_map_put();
502 545
503 svc_unregister(serv);
504 kfree(serv->sv_pools); 546 kfree(serv->sv_pools);
505 kfree(serv); 547 kfree(serv);
506} 548}