-rw-r--r--  fs/lockd/mon.c                       |    6
-rw-r--r--  fs/nfs/blocklayout/Makefile          |    3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c     | 1386
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h     |  213
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c  |  384
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c   |  108
-rw-r--r--  fs/nfs/blocklayout/dev.c             |  363
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c     |  602
-rw-r--r--  fs/nfs/blocklayout/extents.c         |  908
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c      |  285
-rw-r--r--  fs/nfs/callback_proc.c               |   23
-rw-r--r--  fs/nfs/client.c                      |    4
-rw-r--r--  fs/nfs/direct.c                      |   14
-rw-r--r--  fs/nfs/file.c                        |   52
-rw-r--r--  fs/nfs/filelayout/filelayout.c       |   34
-rw-r--r--  fs/nfs/filelayout/filelayout.h       |    7
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c    |  108
-rw-r--r--  fs/nfs/inode.c                       |    4
-rw-r--r--  fs/nfs/internal.h                    |    7
-rw-r--r--  fs/nfs/nfs3_fs.h                     |   34
-rw-r--r--  fs/nfs/nfs3acl.c                     |    1
-rw-r--r--  fs/nfs/nfs3client.c                  |    1
-rw-r--r--  fs/nfs/nfs3proc.c                    |    1
-rw-r--r--  fs/nfs/nfs3super.c                   |    1
-rw-r--r--  fs/nfs/nfs4proc.c                    |  136
-rw-r--r--  fs/nfs/nfs4state.c                   |    1
-rw-r--r--  fs/nfs/nfs4xdr.c                     |  179
-rw-r--r--  fs/nfs/objlayout/objio_osd.c         |  113
-rw-r--r--  fs/nfs/objlayout/objlayout.c         |   70
-rw-r--r--  fs/nfs/objlayout/objlayout.h         |    5
-rw-r--r--  fs/nfs/pagelist.c                    |    8
-rw-r--r--  fs/nfs/pnfs.c                        |  105
-rw-r--r--  fs/nfs/pnfs.h                        |   50
-rw-r--r--  fs/nfs/pnfs_dev.c                    |  150
-rw-r--r--  fs/nfs/super.c                       |   11
-rw-r--r--  fs/nfs/write.c                       |  150
-rw-r--r--  include/linux/nfs_fs.h               |   41
-rw-r--r--  include/linux/nfs_xdr.h              |   17
-rw-r--r--  include/linux/pagemap.h              |   12
-rw-r--r--  include/linux/sunrpc/xprt.h          |    1
-rw-r--r--  include/linux/wait.h                 |    5
-rw-r--r--  kernel/sched/wait.c                  |   36
-rw-r--r--  mm/filemap.c                         |   21
-rw-r--r--  net/sunrpc/clnt.c                    |    2
-rw-r--r--  net/sunrpc/sched.c                   |    2
-rw-r--r--  net/sunrpc/xprtrdma/transport.c      |    2
-rw-r--r--  net/sunrpc/xprtsock.c                |  121
47 files changed, 2498 insertions, 3289 deletions
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
159 159
160 msg.rpc_proc = &clnt->cl_procinfo[proc]; 160 msg.rpc_proc = &clnt->cl_procinfo[proc];
161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); 161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
162 if (status == -ECONNREFUSED) {
163 dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
164 status);
165 rpc_force_rebind(clnt);
166 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
167 }
162 if (status < 0) 168 if (status < 0)
163 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 169 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
164 status); 170 status);
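
The change above makes lockd retry the NSM upcall once after forcing a rebind when the first call fails with ECONNREFUSED, covering the case where the local statd was restarted on a different port. A minimal sketch of the same retry-once-after-rebind idiom (the wrapper name is hypothetical; rpc_call_sync() and rpc_force_rebind() are the real sunrpc APIs):

	static int nsm_call_with_rebind(struct rpc_clnt *clnt, struct rpc_message *msg)
	{
		int status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);

		if (status == -ECONNREFUSED) {
			/* peer may have re-registered on a new port: rebind, retry once */
			rpc_force_rebind(clnt);
			status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);
		}
		return status;
	}
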
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
2# Makefile for the pNFS block layout driver kernel module 2# Makefile for the pNFS block layout driver kernel module
3# 3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o 4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o 5
6blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h> 38#include <linux/prefetch.h>
40#include <linux/pagevec.h> 39#include <linux/pagevec.h>
41 40
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 49MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
51MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 50MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
52 51
53static void print_page(struct page *page) 52static bool is_hole(struct pnfs_block_extent *be)
54{ 53{
55 dprintk("PRINTPAGE page %p\n", page); 54 switch (be->be_state) {
56 dprintk(" PagePrivate %d\n", PagePrivate(page)); 55 case PNFS_BLOCK_NONE_DATA:
57 dprintk(" PageUptodate %d\n", PageUptodate(page)); 56 return true;
58 dprintk(" PageError %d\n", PageError(page)); 57 case PNFS_BLOCK_INVALID_DATA:
59 dprintk(" PageDirty %d\n", PageDirty(page)); 58 return be->be_tag ? false : true;
60 dprintk(" PageReferenced %d\n", PageReferenced(page)); 59 default:
61 dprintk(" PageLocked %d\n", PageLocked(page)); 60 return false;
62 dprintk(" PageWriteback %d\n", PageWriteback(page)); 61 }
63 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
64 dprintk("\n");
65}
66
67/* Given the be associated with isect, determine if page data needs to be
68 * initialized.
69 */
70static int is_hole(struct pnfs_block_extent *be, sector_t isect)
71{
72 if (be->be_state == PNFS_BLOCK_NONE_DATA)
73 return 1;
74 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
75 return 0;
76 else
77 return !bl_is_sector_init(be->be_inval, isect);
78}
79
80/* Given the be associated with isect, determine if page data can be
81 * written to disk.
82 */
83static int is_writable(struct pnfs_block_extent *be, sector_t isect)
84{
85 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
86 be->be_state == PNFS_BLOCK_INVALID_DATA);
87} 62}
88 63
89/* The data we are handed might be spread across several bios. We need 64/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
91 */ 66 */
92struct parallel_io { 67struct parallel_io {
93 struct kref refcnt; 68 struct kref refcnt;
94 void (*pnfs_callback) (void *data, int num_se); 69 void (*pnfs_callback) (void *data);
95 void *data; 70 void *data;
96 int bse_count;
97}; 71};
98 72
99static inline struct parallel_io *alloc_parallel(void *data) 73static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
104 if (rv) { 78 if (rv) {
105 rv->data = data; 79 rv->data = data;
106 kref_init(&rv->refcnt); 80 kref_init(&rv->refcnt);
107 rv->bse_count = 0;
108 } 81 }
109 return rv; 82 return rv;
110} 83}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
119 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 92 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
120 93
121 dprintk("%s enter\n", __func__); 94 dprintk("%s enter\n", __func__);
122 p->pnfs_callback(p->data, p->bse_count); 95 p->pnfs_callback(p->data);
123 kfree(p); 96 kfree(p);
124} 97}
125 98
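
parallel_io ties the many bios issued for one pNFS request to a single completion: the structure holds a kref, each in-flight bio holds a reference, and the last put runs pnfs_callback. With the short-extent bookkeeping gone, the callback no longer needs the num_se argument. A condensed sketch of the lifetime, assuming get_parallel()/put_parallel() wrap kref_get()/kref_put() as elsewhere in this file:

	struct parallel_io *par = alloc_parallel(hdr);	/* kref starts at 1 */
	par->pnfs_callback = bl_end_par_io_read;

	/* for each bio: get_parallel(par) before submit_bio(), and its
	 * bi_end_io handler ends with put_parallel(par) */

	put_parallel(par);	/* drop the initial ref; whichever put is last
				 * runs destroy_parallel(), which invokes
				 * par->pnfs_callback(par->data) exactly once */
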
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
141 return NULL; 114 return NULL;
142} 115}
143 116
144static struct bio *bl_alloc_init_bio(int npg, sector_t isect, 117static struct bio *
145 struct pnfs_block_extent *be, 118bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
146 void (*end_io)(struct bio *, int err), 119 void (*end_io)(struct bio *, int err), struct parallel_io *par)
147 struct parallel_io *par)
148{ 120{
149 struct bio *bio; 121 struct bio *bio;
150 122
@@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
156 } 128 }
157 129
158 if (bio) { 130 if (bio) {
159 bio->bi_iter.bi_sector = isect - be->be_f_offset + 131 bio->bi_iter.bi_sector = disk_sector;
160 be->be_v_offset; 132 bio->bi_bdev = bdev;
161 bio->bi_bdev = be->be_mdev;
162 bio->bi_end_io = end_io; 133 bio->bi_end_io = end_io;
163 bio->bi_private = par; 134 bio->bi_private = par;
164 } 135 }
165 return bio; 136 return bio;
166} 137}
167 138
168static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, 139static struct bio *
169 sector_t isect, struct page *page, 140do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
170 struct pnfs_block_extent *be, 141 struct page *page, struct pnfs_block_dev_map *map,
171 void (*end_io)(struct bio *, int err), 142 struct pnfs_block_extent *be,
172 struct parallel_io *par, 143 void (*end_io)(struct bio *, int err),
173 unsigned int offset, int len) 144 struct parallel_io *par, unsigned int offset, int *len)
174{ 145{
175 isect = isect + (offset >> SECTOR_SHIFT); 146 struct pnfs_block_dev *dev =
147 container_of(be->be_device, struct pnfs_block_dev, node);
148 u64 disk_addr, end;
149
176 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, 150 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
177 npg, rw, (unsigned long long)isect, offset, len); 151 npg, rw, (unsigned long long)isect, offset, *len);
152
153 /* translate to device offset */
154 isect += be->be_v_offset;
155 isect -= be->be_f_offset;
156
157 /* translate to physical disk offset */
158 disk_addr = (u64)isect << SECTOR_SHIFT;
159 if (disk_addr < map->start || disk_addr >= map->start + map->len) {
160 if (!dev->map(dev, disk_addr, map))
161 return ERR_PTR(-EIO);
162 bio = bl_submit_bio(rw, bio);
163 }
164 disk_addr += map->disk_offset;
165 disk_addr -= map->start;
166
167 /* limit length to what the device mapping allows */
168 end = disk_addr + *len;
169 if (end >= map->start + map->len)
170 *len = map->start + map->len - disk_addr;
171
178retry: 172retry:
179 if (!bio) { 173 if (!bio) {
180 bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 174 bio = bl_alloc_init_bio(npg, map->bdev,
175 disk_addr >> SECTOR_SHIFT, end_io, par);
181 if (!bio) 176 if (!bio)
182 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
183 } 178 }
184 if (bio_add_page(bio, page, len, offset) < len) { 179 if (bio_add_page(bio, page, *len, offset) < *len) {
185 bio = bl_submit_bio(rw, bio); 180 bio = bl_submit_bio(rw, bio);
186 goto retry; 181 goto retry;
187 } 182 }
188 return bio; 183 return bio;
189} 184}
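
do_add_page_to_bio() now translates addresses in two steps: the extent shifts the file-relative sector into the volume's sector space (isect + be_v_offset - be_f_offset), and the cached pnfs_block_dev_map then resolves that byte address to a physical block device, flushing any open bio when the window changes since it may point at a different bdev. A worked example with made-up numbers:

	/* extent: file sectors from be_f_offset = 1024 map to volume
	 * sector be_v_offset = 8192; request is file sector 1030 */
	sector_t isect = 1030;
	u64 disk_addr = (u64)(isect + 8192 - 1024) << SECTOR_SHIFT;
					/* volume sector 8198 = byte 4197376 */
	/* a cached map covering [start = 4 MiB, len = 1 MiB) contains this
	 * address, so no dev->map() call is needed; map->disk_offset is
	 * applied relative to map->start, and *len is clamped so the bio
	 * never crosses map->start + map->len */
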
190 185
191static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
192 sector_t isect, struct page *page,
193 struct pnfs_block_extent *be,
194 void (*end_io)(struct bio *, int err),
195 struct parallel_io *par)
196{
197 return do_add_page_to_bio(bio, npg, rw, isect, page, be,
198 end_io, par, 0, PAGE_CACHE_SIZE);
199}
200
201/* This is basically copied from mpage_end_io_read */
202static void bl_end_io_read(struct bio *bio, int err) 186static void bl_end_io_read(struct bio *bio, int err)
203{ 187{
204 struct parallel_io *par = bio->bi_private; 188 struct parallel_io *par = bio->bi_private;
205 struct bio_vec *bvec;
206 int i;
207
208 if (!err)
209 bio_for_each_segment_all(bvec, bio, i)
210 SetPageUptodate(bvec->bv_page);
211 189
212 if (err) { 190 if (err) {
213 struct nfs_pgio_header *header = par->data; 191 struct nfs_pgio_header *header = par->data;
@@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err)
216 header->pnfs_error = -EIO; 194 header->pnfs_error = -EIO;
217 pnfs_set_lo_fail(header->lseg); 195 pnfs_set_lo_fail(header->lseg);
218 } 196 }
197
219 bio_put(bio); 198 bio_put(bio);
220 put_parallel(par); 199 put_parallel(par);
221} 200}
@@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work)
231} 210}
232 211
233static void 212static void
234bl_end_par_io_read(void *data, int unused) 213bl_end_par_io_read(void *data)
235{ 214{
236 struct nfs_pgio_header *hdr = data; 215 struct nfs_pgio_header *hdr = data;
237 216
@@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused)
241} 220}
242 221
243static enum pnfs_try_status 222static enum pnfs_try_status
244bl_read_pagelist(struct nfs_pgio_header *hdr) 223bl_read_pagelist(struct nfs_pgio_header *header)
245{ 224{
246 struct nfs_pgio_header *header = hdr; 225 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
247 int i, hole; 226 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
248 struct bio *bio = NULL; 227 struct bio *bio = NULL;
249 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 228 struct pnfs_block_extent be;
250 sector_t isect, extent_length = 0; 229 sector_t isect, extent_length = 0;
251 struct parallel_io *par; 230 struct parallel_io *par;
252 loff_t f_offset = hdr->args.offset; 231 loff_t f_offset = header->args.offset;
253 size_t bytes_left = hdr->args.count; 232 size_t bytes_left = header->args.count;
254 unsigned int pg_offset, pg_len; 233 unsigned int pg_offset, pg_len;
255 struct page **pages = hdr->args.pages; 234 struct page **pages = header->args.pages;
256 int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT; 235 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
257 const bool is_dio = (header->dreq != NULL); 236 const bool is_dio = (header->dreq != NULL);
237 struct blk_plug plug;
238 int i;
258 239
259 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 240 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
260 hdr->page_array.npages, f_offset, 241 header->page_array.npages, f_offset,
261 (unsigned int)hdr->args.count); 242 (unsigned int)header->args.count);
262 243
263 par = alloc_parallel(hdr); 244 par = alloc_parallel(header);
264 if (!par) 245 if (!par)
265 goto use_mds; 246 return PNFS_NOT_ATTEMPTED;
266 par->pnfs_callback = bl_end_par_io_read; 247 par->pnfs_callback = bl_end_par_io_read;
267 /* At this point, we can no longer jump to use_mds */ 248
249 blk_start_plug(&plug);
268 250
269 isect = (sector_t) (f_offset >> SECTOR_SHIFT); 251 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
270 /* Code assumes extents are page-aligned */ 252 /* Code assumes extents are page-aligned */
271 for (i = pg_index; i < hdr->page_array.npages; i++) { 253 for (i = pg_index; i < header->page_array.npages; i++) {
272 if (!extent_length) { 254 if (extent_length <= 0) {
273 /* We've used up the previous extent */ 255 /* We've used up the previous extent */
274 bl_put_extent(be);
275 bl_put_extent(cow_read);
276 bio = bl_submit_bio(READ, bio); 256 bio = bl_submit_bio(READ, bio);
257
277 /* Get the next one */ 258 /* Get the next one */
278 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 259 if (!ext_tree_lookup(bl, isect, &be, false)) {
279 isect, &cow_read);
280 if (!be) {
281 header->pnfs_error = -EIO; 260 header->pnfs_error = -EIO;
282 goto out; 261 goto out;
283 } 262 }
284 extent_length = be->be_length - 263 extent_length = be.be_length - (isect - be.be_f_offset);
285 (isect - be->be_f_offset);
286 if (cow_read) {
287 sector_t cow_length = cow_read->be_length -
288 (isect - cow_read->be_f_offset);
289 extent_length = min(extent_length, cow_length);
290 }
291 } 264 }
292 265
266 pg_offset = f_offset & ~PAGE_CACHE_MASK;
293 if (is_dio) { 267 if (is_dio) {
294 pg_offset = f_offset & ~PAGE_CACHE_MASK;
295 if (pg_offset + bytes_left > PAGE_CACHE_SIZE) 268 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
296 pg_len = PAGE_CACHE_SIZE - pg_offset; 269 pg_len = PAGE_CACHE_SIZE - pg_offset;
297 else 270 else
298 pg_len = bytes_left; 271 pg_len = bytes_left;
299
300 f_offset += pg_len;
301 bytes_left -= pg_len;
302 isect += (pg_offset >> SECTOR_SHIFT);
303 } else { 272 } else {
304 pg_offset = 0; 273 BUG_ON(pg_offset != 0);
305 pg_len = PAGE_CACHE_SIZE; 274 pg_len = PAGE_CACHE_SIZE;
306 } 275 }
307 276
308 hole = is_hole(be, isect); 277 isect += (pg_offset >> SECTOR_SHIFT);
309 if (hole && !cow_read) { 278 extent_length -= (pg_offset >> SECTOR_SHIFT);
279
280 if (is_hole(&be)) {
310 bio = bl_submit_bio(READ, bio); 281 bio = bl_submit_bio(READ, bio);
311 /* Fill hole w/ zeroes w/o accessing device */ 282 /* Fill hole w/ zeroes w/o accessing device */
312 dprintk("%s Zeroing page for hole\n", __func__); 283 dprintk("%s Zeroing page for hole\n", __func__);
313 zero_user_segment(pages[i], pg_offset, pg_len); 284 zero_user_segment(pages[i], pg_offset, pg_len);
314 print_page(pages[i]);
315 SetPageUptodate(pages[i]);
316 } else {
317 struct pnfs_block_extent *be_read;
318 285
319 be_read = (hole && cow_read) ? cow_read : be; 286 /* invalidate map */
287 map.start = NFS4_MAX_UINT64;
288 } else {
320 bio = do_add_page_to_bio(bio, 289 bio = do_add_page_to_bio(bio,
321 hdr->page_array.npages - i, 290 header->page_array.npages - i,
322 READ, 291 READ,
323 isect, pages[i], be_read, 292 isect, pages[i], &map, &be,
324 bl_end_io_read, par, 293 bl_end_io_read, par,
325 pg_offset, pg_len); 294 pg_offset, &pg_len);
326 if (IS_ERR(bio)) { 295 if (IS_ERR(bio)) {
327 header->pnfs_error = PTR_ERR(bio); 296 header->pnfs_error = PTR_ERR(bio);
328 bio = NULL; 297 bio = NULL;
@@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
330 } 299 }
331 } 300 }
332 isect += (pg_len >> SECTOR_SHIFT); 301 isect += (pg_len >> SECTOR_SHIFT);
333 extent_length -= PAGE_CACHE_SECTORS; 302 extent_length -= (pg_len >> SECTOR_SHIFT);
303 f_offset += pg_len;
304 bytes_left -= pg_len;
334 } 305 }
335 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 306 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
336 hdr->res.eof = 1; 307 header->res.eof = 1;
337 hdr->res.count = header->inode->i_size - hdr->args.offset; 308 header->res.count = header->inode->i_size - header->args.offset;
338 } else { 309 } else {
339 hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset; 310 header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
340 } 311 }
341out: 312out:
342 bl_put_extent(be);
343 bl_put_extent(cow_read);
344 bl_submit_bio(READ, bio); 313 bl_submit_bio(READ, bio);
314 blk_finish_plug(&plug);
345 put_parallel(par); 315 put_parallel(par);
346 return PNFS_ATTEMPTED; 316 return PNFS_ATTEMPTED;
347
348 use_mds:
349 dprintk("Giving up and using normal NFS\n");
350 return PNFS_NOT_ATTEMPTED;
351}
352
353static void mark_extents_written(struct pnfs_block_layout *bl,
354 __u64 offset, __u32 count)
355{
356 sector_t isect, end;
357 struct pnfs_block_extent *be;
358 struct pnfs_block_short_extent *se;
359
360 dprintk("%s(%llu, %u)\n", __func__, offset, count);
361 if (count == 0)
362 return;
363 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
364 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
365 end >>= SECTOR_SHIFT;
366 while (isect < end) {
367 sector_t len;
368 be = bl_find_get_extent(bl, isect, NULL);
369 BUG_ON(!be); /* FIXME */
370 len = min(end, be->be_f_offset + be->be_length) - isect;
371 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
372 se = bl_pop_one_short_extent(be->be_inval);
373 BUG_ON(!se);
374 bl_mark_for_commit(be, isect, len, se);
375 }
376 isect += len;
377 bl_put_extent(be);
378 }
379}
380
381static void bl_end_io_write_zero(struct bio *bio, int err)
382{
383 struct parallel_io *par = bio->bi_private;
384 struct bio_vec *bvec;
385 int i;
386
387 bio_for_each_segment_all(bvec, bio, i) {
388 /* This is the zeroing page we added */
389 end_page_writeback(bvec->bv_page);
390 page_cache_release(bvec->bv_page);
391 }
392
393 if (unlikely(err)) {
394 struct nfs_pgio_header *header = par->data;
395
396 if (!header->pnfs_error)
397 header->pnfs_error = -EIO;
398 pnfs_set_lo_fail(header->lseg);
399 }
400 bio_put(bio);
401 put_parallel(par);
402} 317}
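
In bl_read_pagelist() above, only direct I/O may start or end mid-page: pg_offset is the offset into the first page and pg_len is capped at the page end or the bytes remaining, while buffered reads always cover whole pages (the BUG_ON checks exactly that). A short trace of the window arithmetic, assuming a 4096-byte PAGE_CACHE_SIZE:

	/* direct read: f_offset = 5000, bytes_left = 10000 */
	pg_offset = f_offset & ~PAGE_CACHE_MASK;	/* 5000 mod 4096 = 904 */
	if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
		pg_len = PAGE_CACHE_SIZE - pg_offset;	/* 4096 - 904 = 3192 */
	else
		pg_len = bytes_left;
	/* the sector cursor advances by pg_offset >> SECTOR_SHIFT before the
	 * bio is built, and by pg_len >> SECTOR_SHIFT after each page */
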
403 318
404static void bl_end_io_write(struct bio *bio, int err) 319static void bl_end_io_write(struct bio *bio, int err)
@@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
421 */ 336 */
422static void bl_write_cleanup(struct work_struct *work) 337static void bl_write_cleanup(struct work_struct *work)
423{ 338{
424 struct rpc_task *task; 339 struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
425 struct nfs_pgio_header *hdr; 340 struct nfs_pgio_header *hdr =
341 container_of(task, struct nfs_pgio_header, task);
342
426 dprintk("%s enter\n", __func__); 343 dprintk("%s enter\n", __func__);
427 task = container_of(work, struct rpc_task, u.tk_work); 344
428 hdr = container_of(task, struct nfs_pgio_header, task);
429 if (likely(!hdr->pnfs_error)) { 345 if (likely(!hdr->pnfs_error)) {
430 /* Marks for LAYOUTCOMMIT */ 346 struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
431 mark_extents_written(BLK_LSEG2EXT(hdr->lseg), 347 u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
432 hdr->args.offset, hdr->args.count); 348 u64 end = (hdr->args.offset + hdr->args.count +
349 PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
350
351 ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
352 (end - start) >> SECTOR_SHIFT);
433 } 353 }
354
434 pnfs_ld_write_done(hdr); 355 pnfs_ld_write_done(hdr);
435} 356}
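
Because the driver only ever writes whole pages, bl_write_cleanup() rounds the committed byte range outward to page boundaries before handing it to ext_tree_mark_written() in sectors. A worked example with 4 KiB pages:

	/* hdr->args.offset = 6000, hdr->args.count = 3000 */
	u64 start = 6000 & (loff_t)PAGE_CACHE_MASK;		   /* 4096 */
	u64 end = (6000 + 3000 + 4096 - 1) & (loff_t)PAGE_CACHE_MASK; /* 12288 */
	/* marks sectors [4096 >> 9, 12288 >> 9) = [8, 24) as written */
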
436 357
437/* Called when last of bios associated with a bl_write_pagelist call finishes */ 358/* Called when last of bios associated with a bl_write_pagelist call finishes */
438static void bl_end_par_io_write(void *data, int num_se) 359static void bl_end_par_io_write(void *data)
439{ 360{
440 struct nfs_pgio_header *hdr = data; 361 struct nfs_pgio_header *hdr = data;
441 362
442 if (unlikely(hdr->pnfs_error)) {
443 bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
444 num_se);
445 }
446
447 hdr->task.tk_status = hdr->pnfs_error; 363 hdr->task.tk_status = hdr->pnfs_error;
448 hdr->verf.committed = NFS_FILE_SYNC; 364 hdr->verf.committed = NFS_FILE_SYNC;
449 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); 365 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
450 schedule_work(&hdr->task.u.tk_work); 366 schedule_work(&hdr->task.u.tk_work);
451} 367}
452 368
453/* FIXME STUB - mark intersection of layout and page as bad, so is not
454 * used again.
455 */
456static void mark_bad_read(void)
457{
458 return;
459}
460
461/*
462 * map_block: map a requested I/O block (isect) into an offset in the LVM
463 * block_device
464 */
465static void
466map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
467{
468 dprintk("%s enter be=%p\n", __func__, be);
469
470 set_buffer_mapped(bh);
471 bh->b_bdev = be->be_mdev;
472 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
473 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
474
475 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
476 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
477 bh->b_size);
478 return;
479}
480
481static void
482bl_read_single_end_io(struct bio *bio, int error)
483{
484 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
485 struct page *page = bvec->bv_page;
486
487 /* Only one page in bvec */
488 unlock_page(page);
489}
490
491static int
492bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
493 unsigned int offset, unsigned int len)
494{
495 struct bio *bio;
496 struct page *shadow_page;
497 sector_t isect;
498 char *kaddr, *kshadow_addr;
499 int ret = 0;
500
501 dprintk("%s: offset %u len %u\n", __func__, offset, len);
502
503 shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
504 if (shadow_page == NULL)
505 return -ENOMEM;
506
507 bio = bio_alloc(GFP_NOIO, 1);
508 if (bio == NULL)
509 return -ENOMEM;
510
511 isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
512 (offset / SECTOR_SIZE);
513
514 bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
515 bio->bi_bdev = be->be_mdev;
516 bio->bi_end_io = bl_read_single_end_io;
517
518 lock_page(shadow_page);
519 if (bio_add_page(bio, shadow_page,
520 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
521 unlock_page(shadow_page);
522 bio_put(bio);
523 return -EIO;
524 }
525
526 submit_bio(READ, bio);
527 wait_on_page_locked(shadow_page);
528 if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
529 ret = -EIO;
530 } else {
531 kaddr = kmap_atomic(page);
532 kshadow_addr = kmap_atomic(shadow_page);
533 memcpy(kaddr + offset, kshadow_addr + offset, len);
534 kunmap_atomic(kshadow_addr);
535 kunmap_atomic(kaddr);
536 }
537 __free_page(shadow_page);
538 bio_put(bio);
539
540 return ret;
541}
542
543static int
544bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
545 unsigned int dirty_offset, unsigned int dirty_len,
546 bool full_page)
547{
548 int ret = 0;
549 unsigned int start, end;
550
551 if (full_page) {
552 start = 0;
553 end = PAGE_CACHE_SIZE;
554 } else {
555 start = round_down(dirty_offset, SECTOR_SIZE);
556 end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
557 }
558
559 dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
560 if (!be) {
561 zero_user_segments(page, start, dirty_offset,
562 dirty_offset + dirty_len, end);
563 if (start == 0 && end == PAGE_CACHE_SIZE &&
564 trylock_page(page)) {
565 SetPageUptodate(page);
566 unlock_page(page);
567 }
568 return ret;
569 }
570
571 if (start != dirty_offset)
572 ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
573
574 if (!ret && (dirty_offset + dirty_len < end))
575 ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
576 end - dirty_offset - dirty_len);
577
578 return ret;
579}
580
581/* Given an unmapped page, zero it or read in page for COW, page is locked
582 * by caller.
583 */
584static int
585init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
586{
587 struct buffer_head *bh = NULL;
588 int ret = 0;
589 sector_t isect;
590
591 dprintk("%s enter, %p\n", __func__, page);
592 BUG_ON(PageUptodate(page));
593 if (!cow_read) {
594 zero_user_segment(page, 0, PAGE_SIZE);
595 SetPageUptodate(page);
596 goto cleanup;
597 }
598
599 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
600 if (!bh) {
601 ret = -ENOMEM;
602 goto cleanup;
603 }
604
605 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
606 map_block(bh, isect, cow_read);
607 if (!bh_uptodate_or_lock(bh))
608 ret = bh_submit_read(bh);
609 if (ret)
610 goto cleanup;
611 SetPageUptodate(page);
612
613cleanup:
614 if (bh)
615 free_buffer_head(bh);
616 if (ret) {
617 /* Need to mark layout with bad read...should now
618 * just use nfs4 for reads and writes.
619 */
620 mark_bad_read();
621 }
622 return ret;
623}
624
625/* Find or create a zeroing page marked being writeback.
626 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
627 * to indicate write out.
628 */
629static struct page *
630bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
631 struct pnfs_block_extent *cow_read)
632{
633 struct page *page;
634 int locked = 0;
635 page = find_get_page(inode->i_mapping, index);
636 if (page)
637 goto check_page;
638
639 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
640 if (unlikely(!page)) {
641 dprintk("%s oom\n", __func__);
642 return ERR_PTR(-ENOMEM);
643 }
644 locked = 1;
645
646check_page:
647 /* PageDirty: Other will write this out
648 * PageWriteback: Other is writing this out
649 * PageUptodate: It was read before
650 */
651 if (PageDirty(page) || PageWriteback(page)) {
652 print_page(page);
653 if (locked)
654 unlock_page(page);
655 page_cache_release(page);
656 return NULL;
657 }
658
659 if (!locked) {
660 lock_page(page);
661 locked = 1;
662 goto check_page;
663 }
664 if (!PageUptodate(page)) {
665 /* New page, readin or zero it */
666 init_page_for_write(page, cow_read);
667 }
668 set_page_writeback(page);
669 unlock_page(page);
670
671 return page;
672}
673
674static enum pnfs_try_status 369static enum pnfs_try_status
675bl_write_pagelist(struct nfs_pgio_header *header, int sync) 370bl_write_pagelist(struct nfs_pgio_header *header, int sync)
676{ 371{
677 int i, ret, npg_zero, pg_index, last = 0; 372 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
373 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
678 struct bio *bio = NULL; 374 struct bio *bio = NULL;
679 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 375 struct pnfs_block_extent be;
680 sector_t isect, last_isect = 0, extent_length = 0; 376 sector_t isect, extent_length = 0;
681 struct parallel_io *par = NULL; 377 struct parallel_io *par = NULL;
682 loff_t offset = header->args.offset; 378 loff_t offset = header->args.offset;
683 size_t count = header->args.count; 379 size_t count = header->args.count;
684 unsigned int pg_offset, pg_len, saved_len;
685 struct page **pages = header->args.pages; 380 struct page **pages = header->args.pages;
686 struct page *page; 381 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
687 pgoff_t index; 382 unsigned int pg_len;
688 u64 temp; 383 struct blk_plug plug;
689 int npg_per_block = 384 int i;
690 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
691 385
692 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 386 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
693 387
694 if (header->dreq != NULL &&
695 (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
696 !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
697 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
698 goto out_mds;
699 }
700 /* At this point, header->page_array is a (sequential) list of nfs_pages. 388 /* At this point, header->page_array is a (sequential) list of nfs_pages.
701 * We want to write each, and if there is an error set pnfs_error 389 * We want to write each, and if there is an error set pnfs_error
702 * to have it redone using nfs. 390 * to have it redone using nfs.
703 */ 391 */
704 par = alloc_parallel(header); 392 par = alloc_parallel(header);
705 if (!par) 393 if (!par)
706 goto out_mds; 394 return PNFS_NOT_ATTEMPTED;
707 par->pnfs_callback = bl_end_par_io_write; 395 par->pnfs_callback = bl_end_par_io_write;
708 /* At this point, have to be more careful with error handling */
709 396
710 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 397 blk_start_plug(&plug);
711 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
712 if (!be || !is_writable(be, isect)) {
713 dprintk("%s no matching extents!\n", __func__);
714 goto out_mds;
715 }
716 398
717 /* First page inside INVALID extent */ 399 /* we always write out the whole page */
718 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 400 offset = offset & (loff_t)PAGE_CACHE_MASK;
719 if (likely(!bl_push_one_short_extent(be->be_inval))) 401 isect = offset >> SECTOR_SHIFT;
720 par->bse_count++;
721 else
722 goto out_mds;
723 temp = offset >> PAGE_CACHE_SHIFT;
724 npg_zero = do_div(temp, npg_per_block);
725 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
726 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
727 extent_length = be->be_length - (isect - be->be_f_offset);
728
729fill_invalid_ext:
730 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
731 for (;npg_zero > 0; npg_zero--) {
732 if (bl_is_sector_init(be->be_inval, isect)) {
733 dprintk("isect %llu already init\n",
734 (unsigned long long)isect);
735 goto next_page;
736 }
737 /* page ref released in bl_end_io_write_zero */
738 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
739 dprintk("%s zero %dth page: index %lu isect %llu\n",
740 __func__, npg_zero, index,
741 (unsigned long long)isect);
742 page = bl_find_get_zeroing_page(header->inode, index,
743 cow_read);
744 if (unlikely(IS_ERR(page))) {
745 header->pnfs_error = PTR_ERR(page);
746 goto out;
747 } else if (page == NULL)
748 goto next_page;
749
750 ret = bl_mark_sectors_init(be->be_inval, isect,
751 PAGE_CACHE_SECTORS);
752 if (unlikely(ret)) {
753 dprintk("%s bl_mark_sectors_init fail %d\n",
754 __func__, ret);
755 end_page_writeback(page);
756 page_cache_release(page);
757 header->pnfs_error = ret;
758 goto out;
759 }
760 if (likely(!bl_push_one_short_extent(be->be_inval)))
761 par->bse_count++;
762 else {
763 end_page_writeback(page);
764 page_cache_release(page);
765 header->pnfs_error = -ENOMEM;
766 goto out;
767 }
768 /* FIXME: This should be done in bi_end_io */
769 mark_extents_written(BLK_LSEG2EXT(header->lseg),
770 page->index << PAGE_CACHE_SHIFT,
771 PAGE_CACHE_SIZE);
772
773 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
774 isect, page, be,
775 bl_end_io_write_zero, par);
776 if (IS_ERR(bio)) {
777 header->pnfs_error = PTR_ERR(bio);
778 bio = NULL;
779 goto out;
780 }
781next_page:
782 isect += PAGE_CACHE_SECTORS;
783 extent_length -= PAGE_CACHE_SECTORS;
784 }
785 if (last)
786 goto write_done;
787 }
788 bio = bl_submit_bio(WRITE, bio);
789 402
790 /* Middle pages */
791 pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
792 for (i = pg_index; i < header->page_array.npages; i++) { 403 for (i = pg_index; i < header->page_array.npages; i++) {
793 if (!extent_length) { 404 if (extent_length <= 0) {
794 /* We've used up the previous extent */ 405 /* We've used up the previous extent */
795 bl_put_extent(be);
796 bl_put_extent(cow_read);
797 bio = bl_submit_bio(WRITE, bio); 406 bio = bl_submit_bio(WRITE, bio);
798 /* Get the next one */ 407 /* Get the next one */
799 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 408 if (!ext_tree_lookup(bl, isect, &be, true)) {
800 isect, &cow_read);
801 if (!be || !is_writable(be, isect)) {
802 header->pnfs_error = -EINVAL; 409 header->pnfs_error = -EINVAL;
803 goto out; 410 goto out;
804 } 411 }
805 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
806 if (likely(!bl_push_one_short_extent(
807 be->be_inval)))
808 par->bse_count++;
809 else {
810 header->pnfs_error = -ENOMEM;
811 goto out;
812 }
813 }
814 extent_length = be->be_length -
815 (isect - be->be_f_offset);
816 }
817
818 dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
819 pg_offset = offset & ~PAGE_CACHE_MASK;
820 if (pg_offset + count > PAGE_CACHE_SIZE)
821 pg_len = PAGE_CACHE_SIZE - pg_offset;
822 else
823 pg_len = count;
824
825 saved_len = pg_len;
826 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
827 !bl_is_sector_init(be->be_inval, isect)) {
828 ret = bl_read_partial_page_sync(pages[i], cow_read,
829 pg_offset, pg_len, true);
830 if (ret) {
831 dprintk("%s bl_read_partial_page_sync fail %d\n",
832 __func__, ret);
833 header->pnfs_error = ret;
834 goto out;
835 }
836
837 ret = bl_mark_sectors_init(be->be_inval, isect,
838 PAGE_CACHE_SECTORS);
839 if (unlikely(ret)) {
840 dprintk("%s bl_mark_sectors_init fail %d\n",
841 __func__, ret);
842 header->pnfs_error = ret;
843 goto out;
844 }
845 412
846 /* Expand to full page write */ 413 extent_length = be.be_length - (isect - be.be_f_offset);
847 pg_offset = 0;
848 pg_len = PAGE_CACHE_SIZE;
849 } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
850 (pg_len & (SECTOR_SIZE - 1))){
851 /* ahh, nasty case. We have to do sync full sector
852 * read-modify-write cycles.
853 */
854 unsigned int saved_offset = pg_offset;
855 ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
856 pg_len, false);
857 pg_offset = round_down(pg_offset, SECTOR_SIZE);
858 pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
859 - pg_offset;
860 } 414 }
861 415
862 416 pg_len = PAGE_CACHE_SIZE;
863 bio = do_add_page_to_bio(bio, header->page_array.npages - i, 417 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
864 WRITE, 418 WRITE, isect, pages[i], &map, &be,
865 isect, pages[i], be,
866 bl_end_io_write, par, 419 bl_end_io_write, par,
867 pg_offset, pg_len); 420 0, &pg_len);
868 if (IS_ERR(bio)) { 421 if (IS_ERR(bio)) {
869 header->pnfs_error = PTR_ERR(bio); 422 header->pnfs_error = PTR_ERR(bio);
870 bio = NULL; 423 bio = NULL;
871 goto out; 424 goto out;
872 } 425 }
873 offset += saved_len;
874 count -= saved_len;
875 isect += PAGE_CACHE_SECTORS;
876 last_isect = isect;
877 extent_length -= PAGE_CACHE_SECTORS;
878 }
879 426
880 /* Last page inside INVALID extent */ 427 offset += pg_len;
881 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 428 count -= pg_len;
882 bio = bl_submit_bio(WRITE, bio); 429 isect += (pg_len >> SECTOR_SHIFT);
883 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; 430 extent_length -= (pg_len >> SECTOR_SHIFT);
884 npg_zero = npg_per_block - do_div(temp, npg_per_block);
885 if (npg_zero < npg_per_block) {
886 last = 1;
887 goto fill_invalid_ext;
888 }
889 } 431 }
890 432
891write_done:
892 header->res.count = header->args.count; 433 header->res.count = header->args.count;
893out: 434out:
894 bl_put_extent(be);
895 bl_put_extent(cow_read);
896 bl_submit_bio(WRITE, bio); 435 bl_submit_bio(WRITE, bio);
436 blk_finish_plug(&plug);
897 put_parallel(par); 437 put_parallel(par);
898 return PNFS_ATTEMPTED; 438 return PNFS_ATTEMPTED;
899out_mds:
900 bl_put_extent(be);
901 bl_put_extent(cow_read);
902 kfree(par);
903 return PNFS_NOT_ATTEMPTED;
904}
905
906/* FIXME - range ignored */
907static void
908release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
909{
910 int i;
911 struct pnfs_block_extent *be;
912
913 spin_lock(&bl->bl_ext_lock);
914 for (i = 0; i < EXTENT_LISTS; i++) {
915 while (!list_empty(&bl->bl_extents[i])) {
916 be = list_first_entry(&bl->bl_extents[i],
917 struct pnfs_block_extent,
918 be_node);
919 list_del(&be->be_node);
920 bl_put_extent(be);
921 }
922 }
923 spin_unlock(&bl->bl_ext_lock);
924}
925
926static void
927release_inval_marks(struct pnfs_inval_markings *marks)
928{
929 struct pnfs_inval_tracking *pos, *temp;
930 struct pnfs_block_short_extent *se, *stemp;
931
932 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
933 list_del(&pos->it_link);
934 kfree(pos);
935 }
936
937 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
938 list_del(&se->bse_node);
939 kfree(se);
940 }
941 return;
942} 439}
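
The rewritten bl_write_pagelist() drops the old sector-granular read-modify-write machinery entirely: it rounds the start offset down to a page boundary and writes full pages, which is safe because the PNFS_READ_WHOLE_PAGE flag added further below makes the client read complete pages before a partial-page write dirties them. A small example of the effect, assuming 4096-byte pages (numbers are made up):

	/* a 1000-byte write at offset 5000 */
	offset = offset & (loff_t)PAGE_CACHE_MASK;	/* 4096 */
	/* one full page, [4096, 8192), is written to the block device */
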
943 440
944static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) 441static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
945{ 442{
946 struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 443 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
444 int err;
947 445
948 dprintk("%s enter\n", __func__); 446 dprintk("%s enter\n", __func__);
949 release_extents(bl, NULL); 447
950 release_inval_marks(&bl->bl_inval); 448 err = ext_tree_remove(bl, true, 0, LLONG_MAX);
449 WARN_ON(err);
450
951 kfree(bl); 451 kfree(bl);
952} 452}
953 453
@@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
960 bl = kzalloc(sizeof(*bl), gfp_flags); 460 bl = kzalloc(sizeof(*bl), gfp_flags);
961 if (!bl) 461 if (!bl)
962 return NULL; 462 return NULL;
463
464 bl->bl_ext_rw = RB_ROOT;
465 bl->bl_ext_ro = RB_ROOT;
963 spin_lock_init(&bl->bl_ext_lock); 466 spin_lock_init(&bl->bl_ext_lock);
964 INIT_LIST_HEAD(&bl->bl_extents[0]); 467
965 INIT_LIST_HEAD(&bl->bl_extents[1]);
966 INIT_LIST_HEAD(&bl->bl_commit);
967 INIT_LIST_HEAD(&bl->bl_committing);
968 bl->bl_count = 0;
969 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
970 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
971 return &bl->bl_layout; 468 return &bl->bl_layout;
972} 469}
973 470
@@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
977 kfree(lseg); 474 kfree(lseg);
978} 475}
979 476
980/* We pretty much ignore lseg, and store all data layout wide, so we 477/* Tracks info needed to ensure extents in layout obey constraints of spec */
981 * can correctly merge. 478struct layout_verification {
982 */ 479 u32 mode; /* R or RW */
983static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, 480 u64 start; /* Expected start of next non-COW extent */
984 struct nfs4_layoutget_res *lgr, 481 u64 inval; /* Start of INVAL coverage */
985 gfp_t gfp_flags) 482 u64 cowread; /* End of COW read coverage */
986{ 483};
987 struct pnfs_layout_segment *lseg;
988 int status;
989 484
990 dprintk("%s enter\n", __func__); 485/* Verify the extent meets the layout requirements of the pnfs-block draft,
991 lseg = kzalloc(sizeof(*lseg), gfp_flags); 486 * section 2.3.1.
992 if (!lseg) 487 */
993 return ERR_PTR(-ENOMEM); 488static int verify_extent(struct pnfs_block_extent *be,
994 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); 489 struct layout_verification *lv)
995 if (status) { 490{
996 /* We don't want to call the full-blown bl_free_lseg, 491 if (lv->mode == IOMODE_READ) {
997 * since on error extents were not touched. 492 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
998 */ 493 be->be_state == PNFS_BLOCK_INVALID_DATA)
999 kfree(lseg); 494 return -EIO;
1000 return ERR_PTR(status); 495 if (be->be_f_offset != lv->start)
496 return -EIO;
497 lv->start += be->be_length;
498 return 0;
1001 } 499 }
1002 return lseg; 500 /* lv->mode == IOMODE_RW */
501 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
502 if (be->be_f_offset != lv->start)
503 return -EIO;
504 if (lv->cowread > lv->start)
505 return -EIO;
506 lv->start += be->be_length;
507 lv->inval = lv->start;
508 return 0;
509 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
510 if (be->be_f_offset != lv->start)
511 return -EIO;
512 lv->start += be->be_length;
513 return 0;
514 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
515 if (be->be_f_offset > lv->start)
516 return -EIO;
517 if (be->be_f_offset < lv->inval)
518 return -EIO;
519 if (be->be_f_offset < lv->cowread)
520 return -EIO;
521 /* It looks like you might want to min this with lv->start,
522 * but you really don't.
523 */
524 lv->inval = lv->inval + be->be_length;
525 lv->cowread = be->be_f_offset + be->be_length;
526 return 0;
527 } else
528 return -EIO;
1003} 529}
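
verify_extent() enforces the layout rules of the pnfs-block draft: in an RW layout, READWRITE and INVALID extents must tile the range contiguously (tracked by lv->start), while READ_DATA extents may appear only as COW sources under already-granted INVALID coverage (lv->inval) and must be contiguous among themselves (lv->cowread). A hand-trace over a hypothetical RW layout for sectors [0, 300):

	/* lv = { .mode = IOMODE_RW, .start = 0, .inval = 0, .cowread = 0 }
	 *
	 * 1. READWRITE [0,100)    ok: f_offset == start; start = inval = 100
	 * 2. INVALID   [100,200)  ok: f_offset == start; start = 200
	 * 3. READ      [100,200)  ok: f_offset <= start and not below inval
	 *                         or cowread; inval = 200, cowread = 200
	 * 4. INVALID   [200,300)  ok: start = 300
	 *
	 * bl_alloc_lseg() later checks that start << SECTOR_SHIFT equals the
	 * end of the requested range and that start >= cowread, i.e. every
	 * COW read source is covered by writable INVALID space. */
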
1004 530
1005static void 531static int decode_sector_number(__be32 **rp, sector_t *sp)
1006bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
1007 const struct nfs4_layoutcommit_args *arg)
1008{ 532{
1009 dprintk("%s enter\n", __func__); 533 uint64_t s;
1010 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); 534
535 *rp = xdr_decode_hyper(*rp, &s);
536 if (s & 0x1ff) {
537 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
538 return -1;
539 }
540 *sp = s >> SECTOR_SHIFT;
541 return 0;
1011} 542}
1012 543
1013static void 544static int
1014bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) 545bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
546 struct layout_verification *lv, struct list_head *extents,
547 gfp_t gfp_mask)
1015{ 548{
1016 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; 549 struct pnfs_block_extent *be;
550 struct nfs4_deviceid id;
551 int error;
552 __be32 *p;
1017 553
1018 dprintk("%s enter\n", __func__); 554 p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
1019 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); 555 if (!p)
1020} 556 return -EIO;
1021 557
1022static void free_blk_mountid(struct block_mount_id *mid) 558 be = kzalloc(sizeof(*be), GFP_NOFS);
1023{ 559 if (!be)
1024 if (mid) { 560 return -ENOMEM;
1025 struct pnfs_block_dev *dev, *tmp;
1026 561
1027 /* No need to take bm_lock as we are last user freeing bm_devlist */ 562 memcpy(&id, p, NFS4_DEVICEID4_SIZE);
1028 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { 563 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
1029 list_del(&dev->bm_node); 564
1030 bl_free_block_dev(dev); 565 error = -EIO;
1031 } 566 be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
1032 kfree(mid); 567 lo->plh_lc_cred, gfp_mask);
568 if (!be->be_device)
569 goto out_free_be;
570
571 /*
572 * The next three values are read in as bytes, but stored in the
573 * extent structure in 512-byte granularity.
574 */
575 if (decode_sector_number(&p, &be->be_f_offset) < 0)
576 goto out_put_deviceid;
577 if (decode_sector_number(&p, &be->be_length) < 0)
578 goto out_put_deviceid;
579 if (decode_sector_number(&p, &be->be_v_offset) < 0)
580 goto out_put_deviceid;
581 be->be_state = be32_to_cpup(p++);
582
583 error = verify_extent(be, lv);
584 if (error) {
585 dprintk("%s: extent verification failed\n", __func__);
586 goto out_put_deviceid;
1033 } 587 }
588
589 list_add_tail(&be->be_list, extents);
590 return 0;
591
592out_put_deviceid:
593 nfs4_put_deviceid_node(be->be_device);
594out_free_be:
595 kfree(be);
596 return error;
1034} 597}
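
Each extent on the wire is a fixed 44 bytes, hence the xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE) above: a 16-byte deviceid plus three 64-bit byte quantities (file offset, length, volume offset) and a 32-bit state, 3 * 8 + 4 = 28. decode_sector_number() converts the byte quantities to 512-byte sectors, refusing anything unaligned:

	/* 0x1ff masks the low 9 bits, so a byte value v is sector-aligned
	 * iff (v & 0x1ff) == 0, and v >> SECTOR_SHIFT is then exact */
	u64 v = 0x3000;			/* 12288 bytes */
	/* (v & 0x1ff) == 0, so v decodes to sector 12288 >> 9 = 24 */
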
1035 598
1036/* This is mostly copied from the filelayout_get_device_info function. 599static struct pnfs_layout_segment *
1037 * It seems much of this should be at the generic pnfs level. 600bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
1038 */ 601 gfp_t gfp_mask)
1039static struct pnfs_block_dev *
1040nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1041 struct nfs4_deviceid *d_id)
1042{ 602{
1043 struct pnfs_device *dev; 603 struct layout_verification lv = {
1044 struct pnfs_block_dev *rv; 604 .mode = lgr->range.iomode,
1045 u32 max_resp_sz; 605 .start = lgr->range.offset >> SECTOR_SHIFT,
1046 int max_pages; 606 .inval = lgr->range.offset >> SECTOR_SHIFT,
1047 struct page **pages = NULL; 607 .cowread = lgr->range.offset >> SECTOR_SHIFT,
1048 int i, rc; 608 };
609 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
610 struct pnfs_layout_segment *lseg;
611 struct xdr_buf buf;
612 struct xdr_stream xdr;
613 struct page *scratch;
614 int status, i;
615 uint32_t count;
616 __be32 *p;
617 LIST_HEAD(extents);
618
619 dprintk("---> %s\n", __func__);
620
621 lseg = kzalloc(sizeof(*lseg), gfp_mask);
622 if (!lseg)
623 return ERR_PTR(-ENOMEM);
624
625 status = -ENOMEM;
626 scratch = alloc_page(gfp_mask);
627 if (!scratch)
628 goto out;
629
630 xdr_init_decode_pages(&xdr, &buf,
631 lgr->layoutp->pages, lgr->layoutp->len);
632 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
633
634 status = -EIO;
635 p = xdr_inline_decode(&xdr, 4);
636 if (unlikely(!p))
637 goto out_free_scratch;
638
639 count = be32_to_cpup(p++);
640 dprintk("%s: number of extents %d\n", __func__, count);
1049 641
1050 /* 642 /*
1051 * Use the session max response size as the basis for setting 643 * Decode individual extents, putting them in temporary staging area
1052 * GETDEVICEINFO's maxcount 644 * until whole layout is decoded to make error recovery easier.
1053 */ 645 */
1054 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 646 for (i = 0; i < count; i++) {
1055 max_pages = nfs_page_array_len(0, max_resp_sz); 647 status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
1056 dprintk("%s max_resp_sz %u max_pages %d\n", 648 if (status)
1057 __func__, max_resp_sz, max_pages); 649 goto process_extents;
1058
1059 dev = kmalloc(sizeof(*dev), GFP_NOFS);
1060 if (!dev) {
1061 dprintk("%s kmalloc failed\n", __func__);
1062 return ERR_PTR(-ENOMEM);
1063 } 650 }
1064 651
1065 pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS); 652 if (lgr->range.offset + lgr->range.length !=
1066 if (pages == NULL) { 653 lv.start << SECTOR_SHIFT) {
1067 kfree(dev); 654 dprintk("%s Final length mismatch\n", __func__);
1068 return ERR_PTR(-ENOMEM); 655 status = -EIO;
656 goto process_extents;
1069 } 657 }
1070 for (i = 0; i < max_pages; i++) { 658
1071 pages[i] = alloc_page(GFP_NOFS); 659 if (lv.start < lv.cowread) {
1072 if (!pages[i]) { 660 dprintk("%s Final uncovered COW extent\n", __func__);
1073 rv = ERR_PTR(-ENOMEM); 661 status = -EIO;
1074 goto out_free;
1075 }
1076 } 662 }
1077 663
1078 memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 664process_extents:
1079 dev->layout_type = LAYOUT_BLOCK_VOLUME; 665 while (!list_empty(&extents)) {
1080 dev->pages = pages; 666 struct pnfs_block_extent *be =
1081 dev->pgbase = 0; 667 list_first_entry(&extents, struct pnfs_block_extent,
1082 dev->pglen = PAGE_SIZE * max_pages; 668 be_list);
1083 dev->mincount = 0; 669 list_del(&be->be_list);
1084 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; 670
1085 671 if (!status)
1086 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 672 status = ext_tree_insert(bl, be);
1087 rc = nfs4_proc_getdeviceinfo(server, dev, NULL); 673
1088 dprintk("%s getdevice info returns %d\n", __func__, rc); 674 if (status) {
1089 if (rc) { 675 nfs4_put_deviceid_node(be->be_device);
1090 rv = ERR_PTR(rc); 676 kfree(be);
1091 goto out_free; 677 }
1092 } 678 }
1093 679
1094 rv = nfs4_blk_decode_device(server, dev); 680out_free_scratch:
1095 out_free: 681 __free_page(scratch);
1096 for (i = 0; i < max_pages; i++) 682out:
1097 __free_page(pages[i]); 683 dprintk("%s returns %d\n", __func__, status);
1098 kfree(pages); 684 if (status) {
1099 kfree(dev); 685 kfree(lseg);
1100 return rv; 686 return ERR_PTR(status);
687 }
688 return lseg;
1101} 689}
1102 690
1103static int 691static void
1104bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) 692bl_return_range(struct pnfs_layout_hdr *lo,
693 struct pnfs_layout_range *range)
1105{ 694{
1106 struct block_mount_id *b_mt_id = NULL; 695 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
1107 struct pnfs_devicelist *dlist = NULL; 696 sector_t offset = range->offset >> SECTOR_SHIFT, end;
1108 struct pnfs_block_dev *bdev;
1109 LIST_HEAD(block_disklist);
1110 int status, i;
1111
1112 dprintk("%s enter\n", __func__);
1113 697
1114 if (server->pnfs_blksize == 0) { 698 if (range->offset % 8) {
1115 dprintk("%s Server did not return blksize\n", __func__); 699 dprintk("%s: offset %lld not block size aligned\n",
1116 return -EINVAL; 700 __func__, range->offset);
1117 } 701 return;
1118 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
1119 if (!b_mt_id) {
1120 status = -ENOMEM;
1121 goto out_error;
1122 }
1123 /* Initialize nfs4 block layout mount id */
1124 spin_lock_init(&b_mt_id->bm_lock);
1125 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
1126
1127 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
1128 if (!dlist) {
1129 status = -ENOMEM;
1130 goto out_error;
1131 } 702 }
1132 dlist->eof = 0; 703
1133 while (!dlist->eof) { 704 if (range->length != NFS4_MAX_UINT64) {
1134 status = nfs4_proc_getdevicelist(server, fh, dlist); 705 if (range->length % 8) {
1135 if (status) 706 dprintk("%s: length %lld not block size aligned\n",
1136 goto out_error; 707 __func__, range->length);
1137 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", 708 return;
1138 __func__, dlist->num_devs, dlist->eof);
1139 for (i = 0; i < dlist->num_devs; i++) {
1140 bdev = nfs4_blk_get_deviceinfo(server, fh,
1141 &dlist->dev_id[i]);
1142 if (IS_ERR(bdev)) {
1143 status = PTR_ERR(bdev);
1144 goto out_error;
1145 }
1146 spin_lock(&b_mt_id->bm_lock);
1147 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
1148 spin_unlock(&b_mt_id->bm_lock);
1149 } 709 }
1150 }
1151 dprintk("%s SUCCESS\n", __func__);
1152 server->pnfs_ld_data = b_mt_id;
1153 710
1154 out_return: 711 end = offset + (range->length >> SECTOR_SHIFT);
1155 kfree(dlist); 712 } else {
1156 return status; 713 end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
714 }
1157 715
1158 out_error: 716 ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
1159 free_blk_mountid(b_mt_id);
1160 goto out_return;
1161} 717}
1162 718
1163static int 719static int
1164bl_clear_layoutdriver(struct nfs_server *server) 720bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
721{
722 return ext_tree_prepare_commit(arg);
723}
724
725static void
726bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
1165{ 727{
1166 struct block_mount_id *b_mt_id = server->pnfs_ld_data; 728 ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
729}
1167 730
731static int
732bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
733{
1168 dprintk("%s enter\n", __func__); 734 dprintk("%s enter\n", __func__);
1169 free_blk_mountid(b_mt_id); 735
1170 dprintk("%s RETURNS\n", __func__); 736 if (server->pnfs_blksize == 0) {
737 dprintk("%s Server did not return blksize\n", __func__);
738 return -EINVAL;
739 }
740 if (server->pnfs_blksize > PAGE_SIZE) {
741 printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
742 __func__, server->pnfs_blksize);
743 return -EINVAL;
744 }
745
1171 return 0; 746 return 0;
1172} 747}
1173 748
1174static bool 749static bool
1175is_aligned_req(struct nfs_page *req, unsigned int alignment) 750is_aligned_req(struct nfs_pageio_descriptor *pgio,
751 struct nfs_page *req, unsigned int alignment)
1176{ 752{
1177 return IS_ALIGNED(req->wb_offset, alignment) && 753 /*
1178 IS_ALIGNED(req->wb_bytes, alignment); 754 * Always accept buffered writes, higher layers take care of the
755 * right alignment.
756 */
757 if (pgio->pg_dreq == NULL)
758 return true;
759
760 if (!IS_ALIGNED(req->wb_offset, alignment))
761 return false;
762
763 if (IS_ALIGNED(req->wb_bytes, alignment))
764 return true;
765
766 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
767 /*
768 * If the write goes up to the inode size, just write
769 * the full page. Data past the inode size is
770 * guaranteed to be zeroed by the higher level client
771 * code, and this behaviour is mandated by RFC 5663
772 * section 2.3.2.
773 */
774 return true;
775 }
776
777 return false;
1179} 778}
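
The reworked is_aligned_req() only ever rejects direct I/O: buffered requests are always accepted because the page cache already hands the driver whole pages, and a direct write whose unaligned tail ends exactly at i_size is also accepted, since RFC 5663 section 2.3.2 guarantees data past EOF reads as zero and the full last page can therefore be written. A concrete case, assuming 4096-byte pages:

	/* i_size = 10000; direct write of wb_bytes = 1808 at req_offset = 8192:
	 * 8192 + 1808 == 10000 == i_size -> accepted, the whole page is
	 * written and bytes 10000..12287 carry zeroes. The same request
	 * against i_size = 20000 is rejected and falls back to the MDS. */
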
1180 779
1181static void 780static void
1182bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 781bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1183{ 782{
1184 if (pgio->pg_dreq != NULL && 783 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
1185 !is_aligned_req(req, SECTOR_SIZE))
1186 nfs_pageio_reset_read_mds(pgio); 784 nfs_pageio_reset_read_mds(pgio);
1187 else 785 return;
1188 pnfs_generic_pg_init_read(pgio, req); 786 }
787
788 pnfs_generic_pg_init_read(pgio, req);
1189} 789}
1190 790
1191/* 791/*
@@ -1196,10 +796,8 @@ static size_t
1196bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 796bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1197 struct nfs_page *req) 797 struct nfs_page *req)
1198{ 798{
1199 if (pgio->pg_dreq != NULL && 799 if (!is_aligned_req(pgio, req, SECTOR_SIZE))
1200 !is_aligned_req(req, SECTOR_SIZE))
1201 return 0; 800 return 0;
1202
1203 return pnfs_generic_pg_test(pgio, prev, req); 801 return pnfs_generic_pg_test(pgio, prev, req);
1204} 802}
1205 803
@@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1229static void 827static void
1230bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 828bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1231{ 829{
1232 if (pgio->pg_dreq != NULL && 830 u64 wb_size;
1233 !is_aligned_req(req, PAGE_CACHE_SIZE)) { 831
832 if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
1234 nfs_pageio_reset_write_mds(pgio); 833 nfs_pageio_reset_write_mds(pgio);
1235 } else { 834 return;
1236 u64 wb_size;
1237 if (pgio->pg_dreq == NULL)
1238 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
1239 req->wb_index);
1240 else
1241 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1242
1243 pnfs_generic_pg_init_write(pgio, req, wb_size);
1244 } 835 }
836
837 if (pgio->pg_dreq == NULL)
838 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
839 req->wb_index);
840 else
841 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
842
843 pnfs_generic_pg_init_write(pgio, req, wb_size);
1245} 844}
1246 845
1247/* 846/*
@@ -1252,10 +851,8 @@ static size_t
1252bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 851bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1253 struct nfs_page *req) 852 struct nfs_page *req)
1254{ 853{
1255 if (pgio->pg_dreq != NULL && 854 if (!is_aligned_req(pgio, req, PAGE_SIZE))
1256 !is_aligned_req(req, PAGE_CACHE_SIZE))
1257 return 0; 855 return 0;
1258
1259 return pnfs_generic_pg_test(pgio, prev, req); 856 return pnfs_generic_pg_test(pgio, prev, req);
1260} 857}
1261 858
@@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
1275 .id = LAYOUT_BLOCK_VOLUME, 872 .id = LAYOUT_BLOCK_VOLUME,
1276 .name = "LAYOUT_BLOCK_VOLUME", 873 .name = "LAYOUT_BLOCK_VOLUME",
1277 .owner = THIS_MODULE, 874 .owner = THIS_MODULE,
875 .flags = PNFS_LAYOUTRET_ON_SETATTR |
876 PNFS_READ_WHOLE_PAGE,
1278 .read_pagelist = bl_read_pagelist, 877 .read_pagelist = bl_read_pagelist,
1279 .write_pagelist = bl_write_pagelist, 878 .write_pagelist = bl_write_pagelist,
1280 .alloc_layout_hdr = bl_alloc_layout_hdr, 879 .alloc_layout_hdr = bl_alloc_layout_hdr,
1281 .free_layout_hdr = bl_free_layout_hdr, 880 .free_layout_hdr = bl_free_layout_hdr,
1282 .alloc_lseg = bl_alloc_lseg, 881 .alloc_lseg = bl_alloc_lseg,
1283 .free_lseg = bl_free_lseg, 882 .free_lseg = bl_free_lseg,
1284 .encode_layoutcommit = bl_encode_layoutcommit, 883 .return_range = bl_return_range,
884 .prepare_layoutcommit = bl_prepare_layoutcommit,
1285 .cleanup_layoutcommit = bl_cleanup_layoutcommit, 885 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
1286 .set_layoutdriver = bl_set_layoutdriver, 886 .set_layoutdriver = bl_set_layoutdriver,
1287 .clear_layoutdriver = bl_clear_layoutdriver, 887 .alloc_deviceid_node = bl_alloc_deviceid_node,
888 .free_deviceid_node = bl_free_deviceid_node,
1288 .pg_read_ops = &bl_pg_read_ops, 889 .pg_read_ops = &bl_pg_read_ops,
1289 .pg_write_ops = &bl_pg_write_ops, 890 .pg_write_ops = &bl_pg_write_ops,
1290}; 891};
1291 892
1292static const struct rpc_pipe_ops bl_upcall_ops = {
1293 .upcall = rpc_pipe_generic_upcall,
1294 .downcall = bl_pipe_downcall,
1295 .destroy_msg = bl_pipe_destroy_msg,
1296};
1297
1298static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1299 struct rpc_pipe *pipe)
1300{
1301 struct dentry *dir, *dentry;
1302
1303 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1304 if (dir == NULL)
1305 return ERR_PTR(-ENOENT);
1306 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1307 dput(dir);
1308 return dentry;
1309}
1310
1311static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1312 struct rpc_pipe *pipe)
1313{
1314 if (pipe->dentry)
1315 rpc_unlink(pipe->dentry);
1316}
1317
1318static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1319 void *ptr)
1320{
1321 struct super_block *sb = ptr;
1322 struct net *net = sb->s_fs_info;
1323 struct nfs_net *nn = net_generic(net, nfs_net_id);
1324 struct dentry *dentry;
1325 int ret = 0;
1326
1327 if (!try_module_get(THIS_MODULE))
1328 return 0;
1329
1330 if (nn->bl_device_pipe == NULL) {
1331 module_put(THIS_MODULE);
1332 return 0;
1333 }
1334
1335 switch (event) {
1336 case RPC_PIPEFS_MOUNT:
1337 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1338 if (IS_ERR(dentry)) {
1339 ret = PTR_ERR(dentry);
1340 break;
1341 }
1342 nn->bl_device_pipe->dentry = dentry;
1343 break;
1344 case RPC_PIPEFS_UMOUNT:
1345 if (nn->bl_device_pipe->dentry)
1346 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1347 break;
1348 default:
1349 ret = -ENOTSUPP;
1350 break;
1351 }
1352 module_put(THIS_MODULE);
1353 return ret;
1354}
1355
1356static struct notifier_block nfs4blocklayout_block = {
1357 .notifier_call = rpc_pipefs_event,
1358};
1359
1360static struct dentry *nfs4blocklayout_register_net(struct net *net,
1361 struct rpc_pipe *pipe)
1362{
1363 struct super_block *pipefs_sb;
1364 struct dentry *dentry;
1365
1366 pipefs_sb = rpc_get_sb_net(net);
1367 if (!pipefs_sb)
1368 return NULL;
1369 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1370 rpc_put_sb_net(net);
1371 return dentry;
1372}
1373
1374static void nfs4blocklayout_unregister_net(struct net *net,
1375 struct rpc_pipe *pipe)
1376{
1377 struct super_block *pipefs_sb;
1378
1379 pipefs_sb = rpc_get_sb_net(net);
1380 if (pipefs_sb) {
1381 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1382 rpc_put_sb_net(net);
1383 }
1384}
1385
1386static int nfs4blocklayout_net_init(struct net *net)
1387{
1388 struct nfs_net *nn = net_generic(net, nfs_net_id);
1389 struct dentry *dentry;
1390
1391 init_waitqueue_head(&nn->bl_wq);
1392 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1393 if (IS_ERR(nn->bl_device_pipe))
1394 return PTR_ERR(nn->bl_device_pipe);
1395 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1396 if (IS_ERR(dentry)) {
1397 rpc_destroy_pipe_data(nn->bl_device_pipe);
1398 return PTR_ERR(dentry);
1399 }
1400 nn->bl_device_pipe->dentry = dentry;
1401 return 0;
1402}
1403
1404static void nfs4blocklayout_net_exit(struct net *net)
1405{
1406 struct nfs_net *nn = net_generic(net, nfs_net_id);
1407
1408 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1409 rpc_destroy_pipe_data(nn->bl_device_pipe);
1410 nn->bl_device_pipe = NULL;
1411}
1412
1413static struct pernet_operations nfs4blocklayout_net_ops = {
1414 .init = nfs4blocklayout_net_init,
1415 .exit = nfs4blocklayout_net_exit,
1416};
1417
1418static int __init nfs4blocklayout_init(void) 893static int __init nfs4blocklayout_init(void)
1419{ 894{
1420 int ret; 895 int ret;
@@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
1424 ret = pnfs_register_layoutdriver(&blocklayout_type); 899 ret = pnfs_register_layoutdriver(&blocklayout_type);
1425 if (ret) 900 if (ret)
1426 goto out; 901 goto out;
1427 902 ret = bl_init_pipefs();
1428 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1429 if (ret) 903 if (ret)
1430 goto out_remove; 904 goto out_unregister;
1431 ret = register_pernet_subsys(&nfs4blocklayout_net_ops); 905 return 0;
1432 if (ret)
1433 goto out_notifier;
1434out:
1435 return ret;
1436 906
1437out_notifier: 907out_unregister:
1438 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1439out_remove:
1440 pnfs_unregister_layoutdriver(&blocklayout_type); 908 pnfs_unregister_layoutdriver(&blocklayout_type);
909out:
1441 return ret; 910 return ret;
1442} 911}
1443 912
@@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
1446 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 915 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1447 __func__); 916 __func__);
1448 917
1449 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 918 bl_cleanup_pipefs();
1450 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1451 pnfs_unregister_layoutdriver(&blocklayout_type); 919 pnfs_unregister_layoutdriver(&blocklayout_type);
1452} 920}
1453 921
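
The rpc_pipefs notifier and pernet plumbing deleted above is not dropped; it moves into the new rpc_pipefs.c (whose hunk is not shown here) behind the bl_init_pipefs()/bl_cleanup_pipefs() pair declared in blocklayout.h. A minimal sketch of that pair, assuming it simply absorbs the two registrations unchanged:

	int __init bl_init_pipefs(void)
	{
		int ret;

		ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
		if (ret)
			return ret;
		ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
		if (ret)
			rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
		return ret;
	}

	void __exit bl_cleanup_pipefs(void)
	{
		rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
		unregister_pernet_subsys(&nfs4blocklayout_net_ops);
	}
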
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
45#define SECTOR_SIZE (1 << SECTOR_SHIFT) 45#define SECTOR_SIZE (1 << SECTOR_SHIFT)
46 46
47struct block_mount_id { 47struct pnfs_block_dev;
48 spinlock_t bm_lock; /* protects list */
49 struct list_head bm_devlist; /* holds pnfs_block_dev */
50};
51 48
52struct pnfs_block_dev { 49enum pnfs_block_volume_type {
53 struct list_head bm_node; 50 PNFS_BLOCK_VOLUME_SIMPLE = 0,
54 struct nfs4_deviceid bm_mdevid; /* associated devid */ 51 PNFS_BLOCK_VOLUME_SLICE = 1,
55 struct block_device *bm_mdev; /* meta device itself */ 52 PNFS_BLOCK_VOLUME_CONCAT = 2,
56 struct net *net; 53 PNFS_BLOCK_VOLUME_STRIPE = 3,
57}; 54};
58 55
59enum exstate4 { 56#define PNFS_BLOCK_MAX_UUIDS 4
60 PNFS_BLOCK_READWRITE_DATA = 0, 57#define PNFS_BLOCK_MAX_DEVICES 64
61 PNFS_BLOCK_READ_DATA = 1, 58
62 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ 59/*
63 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ 60 * Random upper cap for the uuid length to avoid unbounded allocation.
61 * Not actually limited by the protocol.
62 */
63#define PNFS_BLOCK_UUID_LEN 128
64
65
66struct pnfs_block_volume {
67 enum pnfs_block_volume_type type;
68 union {
69 struct {
70 int len;
71 int nr_sigs;
72 struct {
73 u64 offset;
74 u32 sig_len;
75 u8 sig[PNFS_BLOCK_UUID_LEN];
76 } sigs[PNFS_BLOCK_MAX_UUIDS];
77 } simple;
78 struct {
79 u64 start;
80 u64 len;
81 u32 volume;
82 } slice;
83 struct {
84 u32 volumes_count;
85 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
86 } concat;
87 struct {
88 u64 chunk_size;
89 u32 volumes_count;
90 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
91 } stripe;
92 };
64}; 93};
65 94
66#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ 95struct pnfs_block_dev_map {
96 sector_t start;
97 sector_t len;
67 98
68struct my_tree { 99 sector_t disk_offset;
69 sector_t mtt_step_size; /* Internal sector alignment */ 100 struct block_device *bdev;
70 struct list_head mtt_stub; /* Should be a radix tree */
71}; 101};
72 102
73struct pnfs_inval_markings { 103struct pnfs_block_dev {
74 spinlock_t im_lock; 104 struct nfs4_deviceid_node node;
75 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 105
76 sector_t im_block_size; /* Server blocksize in sectors */ 106 u64 start;
77 struct list_head im_extents; /* Short extents for INVAL->RW conversion */ 107 u64 len;
108
109 u32 nr_children;
110 struct pnfs_block_dev *children;
111 u64 chunk_size;
112
113 struct block_device *bdev;
114 u64 disk_offset;
115
116 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
117 struct pnfs_block_dev_map *map);
78}; 118};
79 119
80struct pnfs_inval_tracking { 120enum exstate4 {
81 struct list_head it_link; 121 PNFS_BLOCK_READWRITE_DATA = 0,
82 int it_sector; 122 PNFS_BLOCK_READ_DATA = 1,
83 int it_tags; 123 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
124 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
84}; 125};
85 126
86/* sector_t fields are all in 512-byte sectors */ 127/* sector_t fields are all in 512-byte sectors */
87struct pnfs_block_extent { 128struct pnfs_block_extent {
88 struct kref be_refcnt; 129 union {
89 struct list_head be_node; /* link into lseg list */ 130 struct rb_node be_node;
90 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ 131 struct list_head be_list;
91 struct block_device *be_mdev; 132 };
133 struct nfs4_deviceid_node *be_device;
92 sector_t be_f_offset; /* the starting offset in the file */ 134 sector_t be_f_offset; /* the starting offset in the file */
93 sector_t be_length; /* the size of the extent */ 135 sector_t be_length; /* the size of the extent */
94 sector_t be_v_offset; /* the starting offset in the volume */ 136 sector_t be_v_offset; /* the starting offset in the volume */
95 enum exstate4 be_state; /* the state of this extent */ 137 enum exstate4 be_state; /* the state of this extent */
96 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ 138#define EXTENT_WRITTEN 1
139#define EXTENT_COMMITTING 2
140 unsigned int be_tag;
97}; 141};
98 142
99/* Shortened extent used by LAYOUTCOMMIT */ 143/* on the wire size of the extent */
100struct pnfs_block_short_extent { 144#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
101 struct list_head bse_node;
102 struct nfs4_deviceid bse_devid;
103 struct block_device *bse_mdev;
104 sector_t bse_f_offset; /* the starting offset in the file */
105 sector_t bse_length; /* the size of the extent */
106};
107
108static inline void
109BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
110{
111 spin_lock_init(&marks->im_lock);
112 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
113 INIT_LIST_HEAD(&marks->im_extents);
114 marks->im_block_size = blocksize;
115 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
116 blocksize);
117}
118
119enum extentclass4 {
 120 RW_EXTENT = 0, /* READWRITE and INVAL */
121 RO_EXTENT = 1, /* READ and NONE */
122 EXTENT_LISTS = 2,
123};
124
125static inline int bl_choose_list(enum exstate4 state)
126{
127 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
128 return RO_EXTENT;
129 else
130 return RW_EXTENT;
131}
132 145
133struct pnfs_block_layout { 146struct pnfs_block_layout {
134 struct pnfs_layout_hdr bl_layout; 147 struct pnfs_layout_hdr bl_layout;
135 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ 148 struct rb_root bl_ext_rw;
149 struct rb_root bl_ext_ro;
136 spinlock_t bl_ext_lock; /* Protects list manipulation */ 150 spinlock_t bl_ext_lock; /* Protects list manipulation */
137 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
138 struct list_head bl_commit; /* Needs layout commit */
139 struct list_head bl_committing; /* Layout committing */
140 unsigned int bl_count; /* entries in bl_commit */
141 sector_t bl_blocksize; /* Server blocksize in sectors */
142}; 151};
143 152
144#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
145
146static inline struct pnfs_block_layout * 153static inline struct pnfs_block_layout *
147BLK_LO2EXT(struct pnfs_layout_hdr *lo) 154BLK_LO2EXT(struct pnfs_layout_hdr *lo)
148{ 155{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
171#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ 178#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
172#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 179#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
173 180
174/* blocklayoutdev.c */ 181/* dev.c */
175ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 182struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
176void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 183 struct pnfs_device *pdev, gfp_t gfp_mask);
177void nfs4_blkdev_put(struct block_device *bdev); 184void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 185
179 struct pnfs_device *dev); 186/* extent_tree.c */
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 187int ext_tree_insert(struct pnfs_block_layout *bl,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 188 struct pnfs_block_extent *new);
182 189int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
183/* blocklayoutdm.c */ 190 sector_t end);
184void bl_free_block_dev(struct pnfs_block_dev *bdev); 191int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
185 192 sector_t len);
186/* extents.c */ 193bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
187struct pnfs_block_extent * 194 struct pnfs_block_extent *ret, bool rw);
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 195int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
189 struct pnfs_block_extent **cow_read); 196void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 197
191 sector_t offset, sector_t length); 198/* rpc_pipefs.c */
192void bl_put_extent(struct pnfs_block_extent *be); 199dev_t bl_resolve_deviceid(struct nfs_server *server,
193struct pnfs_block_extent *bl_alloc_extent(void); 200 struct pnfs_block_volume *b, gfp_t gfp_mask);
194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 201int __init bl_init_pipefs(void);
195int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 202void __exit bl_cleanup_pipefs(void);
196 struct xdr_stream *xdr,
197 const struct nfs4_layoutcommit_args *arg);
198void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
199 const struct nfs4_layoutcommit_args *arg,
200 int status);
201int bl_add_merge_extent(struct pnfs_block_layout *bl,
202 struct pnfs_block_extent *new);
203int bl_mark_for_commit(struct pnfs_block_extent *be,
204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
210 203
211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 204#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
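
Usage sketch for the new extent API (the helper below is illustrative, not part of this patch): a reader resolves a file sector to a volume sector by looking up the covering extent and applying the f_offset/v_offset translation; all sector_t fields are in 512-byte sectors.

	static bool bl_sector_to_disk(struct pnfs_block_layout *bl,
				      sector_t isect, bool rw,
				      sector_t *disk_sector)
	{
		struct pnfs_block_extent be;

		if (!ext_tree_lookup(bl, isect, &be, rw))
			return false;	/* isect not covered by the layout */
		if (be.be_state == PNFS_BLOCK_NONE_DATA)
			return false;	/* a hole: no backing volume data */
		*disk_sector = isect - be.be_f_offset + be.be_v_offset;
		return true;
	}
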
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 file layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/*
57 * Release the block device
58 */
59void nfs4_blkdev_put(struct block_device *bdev)
60{
61 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
62 MINOR(bdev->bd_dev));
63 blkdev_put(bdev, FMODE_READ);
64}
65
66ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
67 size_t mlen)
68{
69 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
70 nfs_net_id);
71
72 if (mlen != sizeof (struct bl_dev_msg))
73 return -EINVAL;
74
75 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
76 return -EFAULT;
77
78 wake_up(&nn->bl_wq);
79
80 return mlen;
81}
82
83void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
84{
85 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
86
87 if (msg->errno >= 0)
88 return;
89 wake_up(bl_pipe_msg->bl_wq);
90}
91
92/*
93 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
94 */
95struct pnfs_block_dev *
96nfs4_blk_decode_device(struct nfs_server *server,
97 struct pnfs_device *dev)
98{
99 struct pnfs_block_dev *rv;
100 struct block_device *bd = NULL;
101 struct bl_pipe_msg bl_pipe_msg;
102 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
103 struct bl_msg_hdr bl_msg = {
104 .type = BL_DEVICE_MOUNT,
105 .totallen = dev->mincount,
106 };
107 uint8_t *dataptr;
108 DECLARE_WAITQUEUE(wq, current);
109 int offset, len, i, rc;
110 struct net *net = server->nfs_client->cl_net;
111 struct nfs_net *nn = net_generic(net, nfs_net_id);
112 struct bl_dev_msg *reply = &nn->bl_mount_reply;
113
114 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
115 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
116 dev->mincount);
117
118 bl_pipe_msg.bl_wq = &nn->bl_wq;
119 memset(msg, 0, sizeof(*msg));
120 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
121 if (!msg->data) {
122 rv = ERR_PTR(-ENOMEM);
123 goto out;
124 }
125
126 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
127 dataptr = (uint8_t *) msg->data;
128 len = dev->mincount;
129 offset = sizeof(bl_msg);
130 for (i = 0; len > 0; i++) {
131 memcpy(&dataptr[offset], page_address(dev->pages[i]),
132 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
133 len -= PAGE_CACHE_SIZE;
134 offset += PAGE_CACHE_SIZE;
135 }
136 msg->len = sizeof(bl_msg) + dev->mincount;
137
138 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
139 add_wait_queue(&nn->bl_wq, &wq);
140 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
141 if (rc < 0) {
142 remove_wait_queue(&nn->bl_wq, &wq);
143 rv = ERR_PTR(rc);
144 goto out;
145 }
146
147 set_current_state(TASK_UNINTERRUPTIBLE);
148 schedule();
149 __set_current_state(TASK_RUNNING);
150 remove_wait_queue(&nn->bl_wq, &wq);
151
152 if (reply->status != BL_DEVICE_REQUEST_PROC) {
153 dprintk("%s failed to open device: %d\n",
154 __func__, reply->status);
155 rv = ERR_PTR(-EINVAL);
156 goto out;
157 }
158
159 bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
160 FMODE_READ, NULL);
161 if (IS_ERR(bd)) {
162 dprintk("%s failed to open device : %ld\n", __func__,
163 PTR_ERR(bd));
164 rv = ERR_CAST(bd);
165 goto out;
166 }
167
168 rv = kzalloc(sizeof(*rv), GFP_NOFS);
169 if (!rv) {
170 rv = ERR_PTR(-ENOMEM);
171 goto out;
172 }
173
174 rv->bm_mdev = bd;
175 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
176 rv->net = net;
177 dprintk("%s Created device %s with bd_block_size %u\n",
178 __func__,
179 bd->bd_disk->disk_name,
180 bd->bd_block_size);
181
182out:
183 kfree(msg->data);
184 return rv;
185}
186
187/* Map deviceid returned by the server to constructed block_device */
188static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
189 struct nfs4_deviceid *id)
190{
191 struct block_device *rv = NULL;
192 struct block_mount_id *mid;
193 struct pnfs_block_dev *dev;
194
195 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
196 mid = BLK_ID(lo);
197 spin_lock(&mid->bm_lock);
198 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
199 if (memcmp(id->data, dev->bm_mdevid.data,
200 NFS4_DEVICEID4_SIZE) == 0) {
201 rv = dev->bm_mdev;
202 goto out;
203 }
204 }
205 out:
206 spin_unlock(&mid->bm_lock);
207 dprintk("%s returning %p\n", __func__, rv);
208 return rv;
209}
210
211/* Tracks info needed to ensure extents in layout obey constraints of spec */
212struct layout_verification {
213 u32 mode; /* R or RW */
214 u64 start; /* Expected start of next non-COW extent */
215 u64 inval; /* Start of INVAL coverage */
216 u64 cowread; /* End of COW read coverage */
217};
218
219/* Verify the extent meets the layout requirements of the pnfs-block draft,
220 * section 2.3.1.
221 */
222static int verify_extent(struct pnfs_block_extent *be,
223 struct layout_verification *lv)
224{
225 if (lv->mode == IOMODE_READ) {
226 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
227 be->be_state == PNFS_BLOCK_INVALID_DATA)
228 return -EIO;
229 if (be->be_f_offset != lv->start)
230 return -EIO;
231 lv->start += be->be_length;
232 return 0;
233 }
234 /* lv->mode == IOMODE_RW */
235 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
236 if (be->be_f_offset != lv->start)
237 return -EIO;
238 if (lv->cowread > lv->start)
239 return -EIO;
240 lv->start += be->be_length;
241 lv->inval = lv->start;
242 return 0;
243 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
244 if (be->be_f_offset != lv->start)
245 return -EIO;
246 lv->start += be->be_length;
247 return 0;
248 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
249 if (be->be_f_offset > lv->start)
250 return -EIO;
251 if (be->be_f_offset < lv->inval)
252 return -EIO;
253 if (be->be_f_offset < lv->cowread)
254 return -EIO;
255 /* It looks like you might want to min this with lv->start,
256 * but you really don't.
257 */
258 lv->inval = lv->inval + be->be_length;
259 lv->cowread = be->be_f_offset + be->be_length;
260 return 0;
261 } else
262 return -EIO;
263}
264
265/* XDR decode pnfs_block_layout4 structure */
266int
267nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
268 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
269{
270 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
271 int i, status = -EIO;
272 uint32_t count;
273 struct pnfs_block_extent *be = NULL, *save;
274 struct xdr_stream stream;
275 struct xdr_buf buf;
276 struct page *scratch;
277 __be32 *p;
278 struct layout_verification lv = {
279 .mode = lgr->range.iomode,
280 .start = lgr->range.offset >> SECTOR_SHIFT,
281 .inval = lgr->range.offset >> SECTOR_SHIFT,
282 .cowread = lgr->range.offset >> SECTOR_SHIFT,
283 };
284 LIST_HEAD(extents);
285
286 dprintk("---> %s\n", __func__);
287
288 scratch = alloc_page(gfp_flags);
289 if (!scratch)
290 return -ENOMEM;
291
292 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
293 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
294
295 p = xdr_inline_decode(&stream, 4);
296 if (unlikely(!p))
297 goto out_err;
298
299 count = be32_to_cpup(p++);
300
301 dprintk("%s enter, number of extents %i\n", __func__, count);
302 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
303 if (unlikely(!p))
304 goto out_err;
305
306 /* Decode individual extents, putting them in temporary
307 * staging area until whole layout is decoded to make error
308 * recovery easier.
309 */
310 for (i = 0; i < count; i++) {
311 be = bl_alloc_extent();
312 if (!be) {
313 status = -ENOMEM;
314 goto out_err;
315 }
316 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
317 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
318 be->be_mdev = translate_devid(lo, &be->be_devid);
319 if (!be->be_mdev)
320 goto out_err;
321
322 /* The next three values are read in as bytes,
323 * but stored as 512-byte sector lengths
324 */
325 if (decode_sector_number(&p, &be->be_f_offset) < 0)
326 goto out_err;
327 if (decode_sector_number(&p, &be->be_length) < 0)
328 goto out_err;
329 if (decode_sector_number(&p, &be->be_v_offset) < 0)
330 goto out_err;
331 be->be_state = be32_to_cpup(p++);
332 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
333 be->be_inval = &bl->bl_inval;
334 if (verify_extent(be, &lv)) {
335 dprintk("%s verify failed\n", __func__);
336 goto out_err;
337 }
338 list_add_tail(&be->be_node, &extents);
339 }
340 if (lgr->range.offset + lgr->range.length !=
341 lv.start << SECTOR_SHIFT) {
342 dprintk("%s Final length mismatch\n", __func__);
343 be = NULL;
344 goto out_err;
345 }
346 if (lv.start < lv.cowread) {
347 dprintk("%s Final uncovered COW extent\n", __func__);
348 be = NULL;
349 goto out_err;
350 }
351 /* Extents decoded properly, now try to merge them in to
352 * existing layout extents.
353 */
354 spin_lock(&bl->bl_ext_lock);
355 list_for_each_entry_safe(be, save, &extents, be_node) {
356 list_del(&be->be_node);
357 status = bl_add_merge_extent(bl, be);
358 if (status) {
359 spin_unlock(&bl->bl_ext_lock);
360 /* This is a fairly catastrophic error, as the
361 * entire layout extent lists are now corrupted.
362 * We should have some way to distinguish this.
363 */
364 be = NULL;
365 goto out_err;
366 }
367 }
368 spin_unlock(&bl->bl_ext_lock);
369 status = 0;
370 out:
371 __free_page(scratch);
372 dprintk("%s returns %i\n", __func__, status);
373 return status;
374
375 out_err:
376 bl_put_extent(be);
377 while (!list_empty(&extents)) {
378 be = list_first_entry(&extents, struct pnfs_block_extent,
379 be_node);
380 list_del(&be->be_node);
381 bl_put_extent(be);
382 }
383 goto out;
384}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(struct net *net, dev_t dev)
42{
43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
45 struct bl_dev_msg bl_umount_request;
46 struct bl_msg_hdr bl_msg = {
47 .type = BL_DEVICE_UMOUNT,
48 .totallen = sizeof(bl_umount_request),
49 };
50 uint8_t *dataptr;
51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
53
54 dprintk("Entering %s\n", __func__);
55
56 bl_pipe_msg.bl_wq = &nn->bl_wq;
57 memset(msg, 0, sizeof(*msg));
58 msg->len = sizeof(bl_msg) + bl_msg.totallen;
59 msg->data = kzalloc(msg->len, GFP_NOFS);
60 if (!msg->data)
61 goto out;
62
63 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
64 bl_umount_request.major = MAJOR(dev);
65 bl_umount_request.minor = MINOR(dev);
66
67 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
68 dataptr = (uint8_t *) msg->data;
69 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
70
71 add_wait_queue(&nn->bl_wq, &wq);
72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
73 remove_wait_queue(&nn->bl_wq, &wq);
74 goto out;
75 }
76
77 set_current_state(TASK_UNINTERRUPTIBLE);
78 schedule();
79 __set_current_state(TASK_RUNNING);
80 remove_wait_queue(&nn->bl_wq, &wq);
81
82out:
83 kfree(msg->data);
84}
85
86/*
87 * Release meta device
88 */
89static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90{
91 dprintk("%s Releasing\n", __func__);
92 nfs4_blkdev_put(bdev->bm_mdev);
93 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
94}
95
96void bl_free_block_dev(struct pnfs_block_dev *bdev)
97{
98 if (bdev) {
99 if (bdev->bm_mdev) {
100 dprintk("%s Removing DM device: %d:%d\n",
101 __func__,
102 MAJOR(bdev->bm_mdev->bd_dev),
103 MINOR(bdev->bm_mdev->bd_dev));
104 nfs4_blk_metadev_release(bdev);
105 }
106 kfree(bdev);
107 }
108}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h>
6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h>
9
10#include "blocklayout.h"
11
12#define NFSDBG_FACILITY NFSDBG_PNFS_LD
13
14static void
15bl_free_device(struct pnfs_block_dev *dev)
16{
17 if (dev->nr_children) {
18 int i;
19
20 for (i = 0; i < dev->nr_children; i++)
21 bl_free_device(&dev->children[i]);
22 kfree(dev->children);
23 } else {
24 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ);
26 }
27}
28
29void
30bl_free_deviceid_node(struct nfs4_deviceid_node *d)
31{
32 struct pnfs_block_dev *dev =
33 container_of(d, struct pnfs_block_dev, node);
34
35 bl_free_device(dev);
36 kfree(dev);
37}
38
39static int
40nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
41{
42 __be32 *p;
43 int i;
44
45 p = xdr_inline_decode(xdr, 4);
46 if (!p)
47 return -EIO;
48 b->type = be32_to_cpup(p++);
49
50 switch (b->type) {
51 case PNFS_BLOCK_VOLUME_SIMPLE:
52 p = xdr_inline_decode(xdr, 4);
53 if (!p)
54 return -EIO;
55 b->simple.nr_sigs = be32_to_cpup(p++);
56 if (!b->simple.nr_sigs) {
57 dprintk("no signature\n");
58 return -EIO;
59 }
60
61 b->simple.len = 4 + 4;
62 for (i = 0; i < b->simple.nr_sigs; i++) {
63 p = xdr_inline_decode(xdr, 8 + 4);
64 if (!p)
65 return -EIO;
66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
67 b->simple.sigs[i].sig_len = be32_to_cpup(p++);
68
69 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
70 if (!p)
71 return -EIO;
72 memcpy(&b->simple.sigs[i].sig, p,
73 b->simple.sigs[i].sig_len);
74
75 b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
76 }
77 break;
78 case PNFS_BLOCK_VOLUME_SLICE:
79 p = xdr_inline_decode(xdr, 8 + 8 + 4);
80 if (!p)
81 return -EIO;
82 p = xdr_decode_hyper(p, &b->slice.start);
83 p = xdr_decode_hyper(p, &b->slice.len);
84 b->slice.volume = be32_to_cpup(p++);
85 break;
86 case PNFS_BLOCK_VOLUME_CONCAT:
87 p = xdr_inline_decode(xdr, 4);
88 if (!p)
89 return -EIO;
90 b->concat.volumes_count = be32_to_cpup(p++);
91
92 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
93 if (!p)
94 return -EIO;
95 for (i = 0; i < b->concat.volumes_count; i++)
96 b->concat.volumes[i] = be32_to_cpup(p++);
97 break;
98 case PNFS_BLOCK_VOLUME_STRIPE:
99 p = xdr_inline_decode(xdr, 8 + 4);
100 if (!p)
101 return -EIO;
102 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
103 b->stripe.volumes_count = be32_to_cpup(p++);
104
105 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
106 if (!p)
107 return -EIO;
108 for (i = 0; i < b->stripe.volumes_count; i++)
109 b->stripe.volumes[i] = be32_to_cpup(p++);
110 break;
111 default:
112 dprintk("unknown volume type!\n");
113 return -EIO;
114 }
115
116 return 0;
117}
118
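
The running b->simple.len total above tracks the on-wire size of a simple volume: the type and signature-count words (4 + 4 bytes), plus 8 (offset) + 4 (length) + sig_len bytes per signature. For example, a volume carrying a single 8-byte signature encodes as 4 + 4 + (8 + 4 + 8) = 28 bytes.
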
119static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
120 struct pnfs_block_dev_map *map)
121{
122 map->start = dev->start;
123 map->len = dev->len;
124 map->disk_offset = dev->disk_offset;
125 map->bdev = dev->bdev;
126 return true;
127}
128
129static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
130 struct pnfs_block_dev_map *map)
131{
132 int i;
133
134 for (i = 0; i < dev->nr_children; i++) {
135 struct pnfs_block_dev *child = &dev->children[i];
136
137 if (child->start > offset ||
138 child->start + child->len <= offset)
139 continue;
140
141 child->map(child, offset - child->start, map);
142 return true;
143 }
144
145 dprintk("%s: ran off loop!\n", __func__);
146 return false;
147}
148
149static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
150 struct pnfs_block_dev_map *map)
151{
152 struct pnfs_block_dev *child;
153 u64 chunk;
154 u32 chunk_idx;
155 u64 disk_offset;
156
157 chunk = div_u64(offset, dev->chunk_size);
158 div_u64_rem(chunk, dev->nr_children, &chunk_idx);
159
 160 if (chunk_idx >= dev->nr_children) {
161 dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
162 __func__, chunk_idx, offset, dev->chunk_size);
163 /* error, should not happen */
164 return false;
165 }
166
167 /* truncate offset to the beginning of the stripe */
168 offset = chunk * dev->chunk_size;
169
170 /* disk offset of the stripe */
171 disk_offset = div_u64(offset, dev->nr_children);
172
173 child = &dev->children[chunk_idx];
174 child->map(child, disk_offset, map);
175
176 map->start += offset;
177 map->disk_offset += disk_offset;
178 map->len = dev->chunk_size;
179 return true;
180}
181
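
A worked example of the stripe arithmetic above, with illustrative numbers (chunk_size = 64 KiB across nr_children = 4 disks, offset = 300 KiB):

	chunk       = 300 KiB / 64 KiB = 4
	chunk_idx   = 4 mod 4          = 0
	offset      = 4 * 64 KiB       = 256 KiB   (truncated to chunk start)
	disk_offset = 256 KiB / 4      = 64 KiB

so the I/O maps onto child 0 at 64 KiB into that disk, with map->len capped at one chunk.
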
182static int
183bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
184 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
185
186
187static int
188bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
189 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
190{
191 struct pnfs_block_volume *v = &volumes[idx];
192 dev_t dev;
193
194 dev = bl_resolve_deviceid(server, v, gfp_mask);
195 if (!dev)
196 return -EIO;
197
198 d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
199 if (IS_ERR(d->bdev)) {
200 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
201 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
202 return PTR_ERR(d->bdev);
203 }
204
205
206 d->len = i_size_read(d->bdev->bd_inode);
207 d->map = bl_map_simple;
208
209 printk(KERN_INFO "pNFS: using block device %s\n",
210 d->bdev->bd_disk->disk_name);
211 return 0;
212}
213
214static int
215bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
216 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
217{
218 struct pnfs_block_volume *v = &volumes[idx];
219 int ret;
220
221 ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
222 if (ret)
223 return ret;
224
225 d->disk_offset = v->slice.start;
226 d->len = v->slice.len;
227 return 0;
228}
229
230static int
231bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
232 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
233{
234 struct pnfs_block_volume *v = &volumes[idx];
235 u64 len = 0;
236 int ret, i;
237
238 d->children = kcalloc(v->concat.volumes_count,
 239 sizeof(struct pnfs_block_dev), gfp_mask);
240 if (!d->children)
241 return -ENOMEM;
242
243 for (i = 0; i < v->concat.volumes_count; i++) {
244 ret = bl_parse_deviceid(server, &d->children[i],
245 volumes, v->concat.volumes[i], gfp_mask);
246 if (ret)
247 return ret;
248
249 d->nr_children++;
250 d->children[i].start += len;
251 len += d->children[i].len;
252 }
253
254 d->len = len;
255 d->map = bl_map_concat;
256 return 0;
257}
258
259static int
260bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
261 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
262{
263 struct pnfs_block_volume *v = &volumes[idx];
264 u64 len = 0;
265 int ret, i;
266
267 d->children = kcalloc(v->stripe.volumes_count,
 268 sizeof(struct pnfs_block_dev), gfp_mask);
269 if (!d->children)
270 return -ENOMEM;
271
272 for (i = 0; i < v->stripe.volumes_count; i++) {
273 ret = bl_parse_deviceid(server, &d->children[i],
274 volumes, v->stripe.volumes[i], gfp_mask);
275 if (ret)
276 return ret;
277
278 d->nr_children++;
279 len += d->children[i].len;
280 }
281
282 d->len = len;
283 d->chunk_size = v->stripe.chunk_size;
284 d->map = bl_map_stripe;
285 return 0;
286}
287
288static int
289bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
290 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
291{
292 switch (volumes[idx].type) {
293 case PNFS_BLOCK_VOLUME_SIMPLE:
294 return bl_parse_simple(server, d, volumes, idx, gfp_mask);
295 case PNFS_BLOCK_VOLUME_SLICE:
296 return bl_parse_slice(server, d, volumes, idx, gfp_mask);
297 case PNFS_BLOCK_VOLUME_CONCAT:
298 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
299 case PNFS_BLOCK_VOLUME_STRIPE:
300 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
301 default:
302 dprintk("unsupported volume type: %d\n", volumes[idx].type);
303 return -EIO;
304 }
305}
306
307struct nfs4_deviceid_node *
308bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
309 gfp_t gfp_mask)
310{
311 struct nfs4_deviceid_node *node = NULL;
312 struct pnfs_block_volume *volumes;
313 struct pnfs_block_dev *top;
314 struct xdr_stream xdr;
315 struct xdr_buf buf;
316 struct page *scratch;
317 int nr_volumes, ret, i;
318 __be32 *p;
319
320 scratch = alloc_page(gfp_mask);
321 if (!scratch)
322 goto out;
323
324 xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
325 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
326
327 p = xdr_inline_decode(&xdr, sizeof(__be32));
328 if (!p)
329 goto out_free_scratch;
330 nr_volumes = be32_to_cpup(p++);
331
332 volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
333 gfp_mask);
334 if (!volumes)
335 goto out_free_scratch;
336
337 for (i = 0; i < nr_volumes; i++) {
338 ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
339 if (ret < 0)
340 goto out_free_volumes;
341 }
342
343 top = kzalloc(sizeof(*top), gfp_mask);
344 if (!top)
345 goto out_free_volumes;
346
347 ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
348 if (ret) {
349 bl_free_device(top);
350 kfree(top);
351 goto out_free_volumes;
352 }
353
354 node = &top->node;
355 nfs4_init_deviceid_node(node, server, &pdev->dev_id);
356
357out_free_volumes:
358 kfree(volumes);
359out_free_scratch:
360 __free_page(scratch);
361out:
362 return node;
363}
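
The device body decoded above is a flat array in which composite volumes refer back to earlier entries by index, and the root is always the last entry, hence the bl_parse_deviceid(..., nr_volumes - 1, ...) call. An illustrative (not wire-format) shape of such an array:

	volumes[0]: SIMPLE  - resolved to a dev_t via the rpc_pipefs upcall
	volumes[1]: SIMPLE  - a second disk
	volumes[2]: STRIPE  - chunk_size, volumes = { 0, 1 }
	volumes[3]: SLICE   - start/len window onto volume 2

bl_parse_deviceid() starts at volumes[3] and recurses through the stripe into both simple volumes, so the finished pnfs_block_dev tree reads slice -> stripe -> { simple, simple }.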
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4
5#include <linux/vmalloc.h>
6
7#include "blocklayout.h"
8
9#define NFSDBG_FACILITY NFSDBG_PNFS_LD
10
11static inline struct pnfs_block_extent *
12ext_node(struct rb_node *node)
13{
14 return rb_entry(node, struct pnfs_block_extent, be_node);
15}
16
17static struct pnfs_block_extent *
18ext_tree_first(struct rb_root *root)
19{
20 struct rb_node *node = rb_first(root);
21 return node ? ext_node(node) : NULL;
22}
23
24static struct pnfs_block_extent *
25ext_tree_prev(struct pnfs_block_extent *be)
26{
27 struct rb_node *node = rb_prev(&be->be_node);
28 return node ? ext_node(node) : NULL;
29}
30
31static struct pnfs_block_extent *
32ext_tree_next(struct pnfs_block_extent *be)
33{
34 struct rb_node *node = rb_next(&be->be_node);
35 return node ? ext_node(node) : NULL;
36}
37
38static inline sector_t
39ext_f_end(struct pnfs_block_extent *be)
40{
41 return be->be_f_offset + be->be_length;
42}
43
44static struct pnfs_block_extent *
45__ext_tree_search(struct rb_root *root, sector_t start)
46{
47 struct rb_node *node = root->rb_node;
48 struct pnfs_block_extent *be = NULL;
49
50 while (node) {
51 be = ext_node(node);
52 if (start < be->be_f_offset)
53 node = node->rb_left;
54 else if (start >= ext_f_end(be))
55 node = node->rb_right;
56 else
57 return be;
58 }
59
60 if (be) {
61 if (start < be->be_f_offset)
62 return be;
63
64 if (start >= ext_f_end(be))
65 return ext_tree_next(be);
66 }
67
68 return NULL;
69}
70
71static bool
72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
73{
74 if (be1->be_state != be2->be_state)
75 return false;
76 if (be1->be_device != be2->be_device)
77 return false;
78
79 if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
80 return false;
81
82 if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
83 (be1->be_v_offset + be1->be_length != be2->be_v_offset))
84 return false;
85
86 if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
87 be1->be_tag != be2->be_tag)
88 return false;
89
90 return true;
91}
92
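Concretely (sector numbers illustrative): extents [f 0-100, v 500-600] and [f 100-150, v 600-650] on the same device with the same state merge, because both the file range and the volume range are contiguous. A PNFS_BLOCK_NONE_DATA hole has no volume mapping, so only file contiguity is checked, and two PNFS_BLOCK_INVALID_DATA extents must additionally agree on be_tag.
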
93static struct pnfs_block_extent *
94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
95{
96 struct pnfs_block_extent *left = ext_tree_prev(be);
97
98 if (left && ext_can_merge(left, be)) {
99 left->be_length += be->be_length;
100 rb_erase(&be->be_node, root);
101 nfs4_put_deviceid_node(be->be_device);
102 kfree(be);
103 return left;
104 }
105
106 return be;
107}
108
109static struct pnfs_block_extent *
110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
111{
112 struct pnfs_block_extent *right = ext_tree_next(be);
113
114 if (right && ext_can_merge(be, right)) {
115 be->be_length += right->be_length;
116 rb_erase(&right->be_node, root);
117 nfs4_put_deviceid_node(right->be_device);
118 kfree(right);
119 }
120
121 return be;
122}
123
124static void
125__ext_tree_insert(struct rb_root *root,
126 struct pnfs_block_extent *new, bool merge_ok)
127{
128 struct rb_node **p = &root->rb_node, *parent = NULL;
129 struct pnfs_block_extent *be;
130
131 while (*p) {
132 parent = *p;
133 be = ext_node(parent);
134
135 if (new->be_f_offset < be->be_f_offset) {
136 if (merge_ok && ext_can_merge(new, be)) {
137 be->be_f_offset = new->be_f_offset;
138 if (be->be_state != PNFS_BLOCK_NONE_DATA)
139 be->be_v_offset = new->be_v_offset;
140 be->be_length += new->be_length;
141 be = ext_try_to_merge_left(root, be);
142 goto free_new;
143 }
144 p = &(*p)->rb_left;
145 } else if (new->be_f_offset >= ext_f_end(be)) {
146 if (merge_ok && ext_can_merge(be, new)) {
147 be->be_length += new->be_length;
148 be = ext_try_to_merge_right(root, be);
149 goto free_new;
150 }
151 p = &(*p)->rb_right;
152 } else {
153 BUG();
154 }
155 }
156
157 rb_link_node(&new->be_node, parent, p);
158 rb_insert_color(&new->be_node, root);
159 return;
160free_new:
161 nfs4_put_deviceid_node(new->be_device);
162 kfree(new);
163}
164
165static int
166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
167{
168 struct pnfs_block_extent *be;
169 sector_t len1 = 0, len2 = 0;
170 sector_t orig_v_offset;
171 sector_t orig_len;
172
173 be = __ext_tree_search(root, start);
174 if (!be)
175 return 0;
176 if (be->be_f_offset >= end)
177 return 0;
178
179 orig_v_offset = be->be_v_offset;
180 orig_len = be->be_length;
181
182 if (start > be->be_f_offset)
183 len1 = start - be->be_f_offset;
184 if (ext_f_end(be) > end)
185 len2 = ext_f_end(be) - end;
186
187 if (len2 > 0) {
188 if (len1 > 0) {
189 struct pnfs_block_extent *new;
190
191 new = kzalloc(sizeof(*new), GFP_ATOMIC);
192 if (!new)
193 return -ENOMEM;
194
195 be->be_length = len1;
196
197 new->be_f_offset = end;
198 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
199 new->be_v_offset =
200 orig_v_offset + orig_len - len2;
201 }
202 new->be_length = len2;
203 new->be_state = be->be_state;
204 new->be_tag = be->be_tag;
205 new->be_device = nfs4_get_deviceid(be->be_device);
206
207 __ext_tree_insert(root, new, true);
208 } else {
209 be->be_f_offset = end;
210 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
211 be->be_v_offset =
212 orig_v_offset + orig_len - len2;
213 }
214 be->be_length = len2;
215 }
216 } else {
217 if (len1 > 0) {
218 be->be_length = len1;
219 be = ext_tree_next(be);
220 }
221
222 while (be && ext_f_end(be) <= end) {
223 struct pnfs_block_extent *next = ext_tree_next(be);
224
225 rb_erase(&be->be_node, root);
226 nfs4_put_deviceid_node(be->be_device);
227 kfree(be);
228 be = next;
229 }
230
231 if (be && be->be_f_offset < end) {
232 len1 = ext_f_end(be) - end;
233 be->be_f_offset = end;
234 if (be->be_state != PNFS_BLOCK_NONE_DATA)
235 be->be_v_offset += be->be_length - len1;
236 be->be_length = len1;
237 }
238 }
239
240 return 0;
241}
242
243int
244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
245{
246 struct pnfs_block_extent *be;
247 struct rb_root *root;
248 int err = 0;
249
250 switch (new->be_state) {
251 case PNFS_BLOCK_READWRITE_DATA:
252 case PNFS_BLOCK_INVALID_DATA:
253 root = &bl->bl_ext_rw;
254 break;
255 case PNFS_BLOCK_READ_DATA:
256 case PNFS_BLOCK_NONE_DATA:
257 root = &bl->bl_ext_ro;
258 break;
259 default:
260 dprintk("invalid extent type\n");
261 return -EINVAL;
262 }
263
264 spin_lock(&bl->bl_ext_lock);
265retry:
266 be = __ext_tree_search(root, new->be_f_offset);
267 if (!be || be->be_f_offset >= ext_f_end(new)) {
268 __ext_tree_insert(root, new, true);
269 } else if (new->be_f_offset >= be->be_f_offset) {
270 if (ext_f_end(new) <= ext_f_end(be)) {
271 nfs4_put_deviceid_node(new->be_device);
272 kfree(new);
273 } else {
274 sector_t new_len = ext_f_end(new) - ext_f_end(be);
275 sector_t diff = new->be_length - new_len;
276
277 new->be_f_offset += diff;
278 new->be_v_offset += diff;
279 new->be_length = new_len;
280 goto retry;
281 }
282 } else if (ext_f_end(new) <= ext_f_end(be)) {
283 new->be_length = be->be_f_offset - new->be_f_offset;
284 __ext_tree_insert(root, new, true);
285 } else {
286 struct pnfs_block_extent *split;
287 sector_t new_len = ext_f_end(new) - ext_f_end(be);
288 sector_t diff = new->be_length - new_len;
289
290 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
291 if (!split) {
 292 err = -ENOMEM;
293 goto out;
294 }
295
296 split->be_length = be->be_f_offset - split->be_f_offset;
297 split->be_device = nfs4_get_deviceid(new->be_device);
298 __ext_tree_insert(root, split, true);
299
300 new->be_f_offset += diff;
301 new->be_v_offset += diff;
302 new->be_length = new_len;
303 goto retry;
304 }
305out:
306 spin_unlock(&bl->bl_ext_lock);
307 return err;
308}
309
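A worked trace of the overlap handling above (sectors illustrative): inserting new = [0, 100) while the tree already holds be = [40, 60) takes the final else branch: split = [0, 40) is carved off and inserted, new is advanced by diff = 60 to [60, 100), and the retry inserts that remainder cleanly. Extents already in the tree always win; only the non-overlapping parts of new survive.
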
310static bool
311__ext_tree_lookup(struct rb_root *root, sector_t isect,
312 struct pnfs_block_extent *ret)
313{
314 struct rb_node *node;
315 struct pnfs_block_extent *be;
316
317 node = root->rb_node;
318 while (node) {
319 be = ext_node(node);
320 if (isect < be->be_f_offset)
321 node = node->rb_left;
322 else if (isect >= ext_f_end(be))
323 node = node->rb_right;
324 else {
325 *ret = *be;
326 return true;
327 }
328 }
329
330 return false;
331}
332
333bool
334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
335 struct pnfs_block_extent *ret, bool rw)
336{
337 bool found = false;
338
339 spin_lock(&bl->bl_ext_lock);
340 if (!rw)
341 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
342 if (!found)
343 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
344 spin_unlock(&bl->bl_ext_lock);
345
346 return found;
347}
348
349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
350 sector_t start, sector_t end)
351{
352 int err, err2;
353
354 spin_lock(&bl->bl_ext_lock);
355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
356 if (rw) {
357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
358 if (!err)
359 err = err2;
360 }
361 spin_unlock(&bl->bl_ext_lock);
362
363 return err;
364}
365
366static int
367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
368 sector_t split)
369{
370 struct pnfs_block_extent *new;
371 sector_t orig_len = be->be_length;
372
373 new = kzalloc(sizeof(*new), GFP_ATOMIC);
374 if (!new)
375 return -ENOMEM;
376
377 be->be_length = split - be->be_f_offset;
378
379 new->be_f_offset = split;
380 if (be->be_state != PNFS_BLOCK_NONE_DATA)
381 new->be_v_offset = be->be_v_offset + be->be_length;
382 new->be_length = orig_len - be->be_length;
383 new->be_state = be->be_state;
384 new->be_tag = be->be_tag;
385 new->be_device = nfs4_get_deviceid(be->be_device);
386
387 __ext_tree_insert(root, new, false);
388 return 0;
389}
390
391int
392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
393 sector_t len)
394{
395 struct rb_root *root = &bl->bl_ext_rw;
396 sector_t end = start + len;
397 struct pnfs_block_extent *be;
398 int err = 0;
399
400 spin_lock(&bl->bl_ext_lock);
401 /*
 402 * First remove all COW extents or holes from the range being written to.
403 */
404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
405 if (err)
406 goto out;
407
408 /*
409 * Then mark all invalid extents in the range as written to.
410 */
411 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
412 if (be->be_f_offset >= end)
413 break;
414
415 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
416 continue;
417
418 if (be->be_f_offset < start) {
419 struct pnfs_block_extent *left = ext_tree_prev(be);
420
421 if (left && ext_can_merge(left, be)) {
422 sector_t diff = start - be->be_f_offset;
423
424 left->be_length += diff;
425
426 be->be_f_offset += diff;
427 be->be_v_offset += diff;
428 be->be_length -= diff;
429 } else {
430 err = ext_tree_split(root, be, start);
431 if (err)
432 goto out;
433 }
434 }
435
436 if (ext_f_end(be) > end) {
437 struct pnfs_block_extent *right = ext_tree_next(be);
438
439 if (right && ext_can_merge(be, right)) {
440 sector_t diff = end - be->be_f_offset;
441
442 be->be_length -= diff;
443
444 right->be_f_offset -= diff;
445 right->be_v_offset -= diff;
446 right->be_length += diff;
447 } else {
448 err = ext_tree_split(root, be, end);
449 if (err)
450 goto out;
451 }
452 }
453
454 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
455 be->be_tag = EXTENT_WRITTEN;
456 be = ext_try_to_merge_left(root, be);
457 be = ext_try_to_merge_right(root, be);
458 }
459 }
460out:
461 spin_unlock(&bl->bl_ext_lock);
462 return err;
463}
464
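For example (sectors illustrative): marking [50, 150) written while a single untagged PNFS_BLOCK_INVALID_DATA extent spans [0, 200) splits it at 50 and again at 150 (or shifts the boundary into a mergeable neighbour instead of splitting), and only the middle piece [50, 150) ends up with be_tag = EXTENT_WRITTEN.
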
465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
466 size_t buffer_size)
467{
468 if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
469 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
470
471 for (i = 0; i < nr_pages; i++)
472 put_page(arg->layoutupdate_pages[i]);
473 kfree(arg->layoutupdate_pages);
474 } else {
475 put_page(arg->layoutupdate_page);
476 }
477}
478
479static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
480 size_t buffer_size, size_t *count)
481{
482 struct pnfs_block_extent *be;
483 int ret = 0;
484
485 spin_lock(&bl->bl_ext_lock);
486 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
487 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
488 be->be_tag != EXTENT_WRITTEN)
489 continue;
490
491 (*count)++;
492 if (*count * BL_EXTENT_SIZE > buffer_size) {
493 /* keep counting.. */
494 ret = -ENOSPC;
495 continue;
496 }
497
498 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
499 NFS4_DEVICEID4_SIZE);
500 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
501 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
502 p = xdr_encode_hyper(p, 0LL);
503 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
504
505 be->be_tag = EXTENT_COMMITTING;
506 }
507 spin_unlock(&bl->bl_ext_lock);
508
509 return ret;
510}
511
512int
513ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
514{
515 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
516 size_t count = 0, buffer_size = PAGE_SIZE;
517 __be32 *start_p;
518 int ret;
519
520 dprintk("%s enter\n", __func__);
521
522 arg->layoutupdate_page = alloc_page(GFP_NOFS);
523 if (!arg->layoutupdate_page)
524 return -ENOMEM;
525 start_p = page_address(arg->layoutupdate_page);
526 arg->layoutupdate_pages = &arg->layoutupdate_page;
527
528retry:
529 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
530 if (unlikely(ret)) {
531 ext_tree_free_commitdata(arg, buffer_size);
532
533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
534 count = 0;
535
536 arg->layoutupdate_pages =
537 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
538 sizeof(struct page *), GFP_NOFS);
539 if (!arg->layoutupdate_pages)
540 return -ENOMEM;
541
542 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
543 if (!start_p) {
544 kfree(arg->layoutupdate_pages);
545 return -ENOMEM;
546 }
547
548 goto retry;
549 }
550
551 *start_p = cpu_to_be32(count);
552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
553
 554 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 555 void *p = start_p, *end = p + arg->layoutupdate_len;
 556 int i = 0;
 557
 558 /* walk in bytes, not __be32 units, so each page is visited once */
 559 for ( ; p < end;
 560 p += PAGE_SIZE) {
 561 arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
 562 }
 563 }
564
565 dprintk("%s found %zu ranges\n", __func__, count);
566 return 0;
567}
568
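The sizing arithmetic above: with BL_EXTENT_SIZE = 7 * 4 + 16 = 44 bytes and 4 KiB pages, the inline layoutupdate_page holds at most (4096 - 4) / 44 = 93 extents after the leading count word. ext_tree_encode_commit() deliberately keeps counting past -ENOSPC so the retry can size the vmalloc'd buffer to exactly sizeof(__be32) + BL_EXTENT_SIZE * count.
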
569void
570ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
571{
572 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
573 struct rb_root *root = &bl->bl_ext_rw;
574 struct pnfs_block_extent *be;
575
576 dprintk("%s status %d\n", __func__, status);
577
578 ext_tree_free_commitdata(arg, arg->layoutupdate_len);
579
580 spin_lock(&bl->bl_ext_lock);
581 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
582 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
583 be->be_tag != EXTENT_COMMITTING)
584 continue;
585
586 if (status) {
587 /*
588 * Mark as written and try again.
589 *
590 * XXX: some real error handling here wouldn't hurt..
591 */
592 be->be_tag = EXTENT_WRITTEN;
593 } else {
594 be->be_state = PNFS_BLOCK_READWRITE_DATA;
595 be->be_tag = 0;
596 }
597
598 be = ext_try_to_merge_left(root, be);
599 be = ext_try_to_merge_right(root, be);
600 }
601 spin_unlock(&bl->bl_ext_lock);
602}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
 2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - sector_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
 55/* Complete stub using a list until the desired API is determined */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 new = storage;
114 new->it_sector = s;
115 new->it_tags = (1 << tag);
116 list_add(&new->it_link, &pos->it_link);
117 return 1;
118 }
119}
120
121/* XXXX Really want option to not create */
122/* Over range, unions tag with existing entries, else creates entry with tag */
123static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
124{
125 u64 i;
126
127 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
128 for (i = normalize(s, tree->mtt_step_size); i < s + length;
129 i += tree->mtt_step_size)
130 if (_add_entry(tree, i, tag, NULL))
131 return -ENOMEM;
132 return 0;
133}
134
135/* Ensure that future operations on given range of tree will not malloc */
136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
138{
139 u64 start, end, s;
140 int count, i, used = 0, status = -ENOMEM;
141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
143
144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
145 start = normalize(offset, tree->mtt_step_size);
146 end = normalize_up(offset + length, tree->mtt_step_size);
147 count = (int)(end - start) / (int)tree->mtt_step_size;
148
149 /* Pre-malloc what memory we might need */
150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage)
152 return -ENOMEM;
153 for (i = 0; i < count; i++) {
154 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
155 GFP_NOFS);
156 if (!storage[i])
157 goto out_cleanup;
158 }
159
160 spin_lock_bh(&marks->im_lock);
161 for (s = start; s < end; s += tree->mtt_step_size)
162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
164
165 status = 0;
166
167 out_cleanup:
168 for (i = used; i < count; i++) {
169 if (!storage[i])
170 break;
171 kfree(storage[i]);
172 }
173 kfree(storage);
174 return status;
175}
176
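
_preload_range() above follows a common kernel pattern: allocate the worst case before taking the lock, consume only what turns out to be needed under the lock, then free the surplus. A minimal userspace sketch of the same shape, using a pthread mutex in place of im_lock (all names hypothetical):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int value; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns the number of pre-allocated nodes consumed, or -1 on ENOMEM. */
static int preload_and_consume(int count)
{
    struct node **storage;
    int i, used = 0, status = -1;

    /* Worst-case allocation happens with the lock dropped. */
    storage = calloc(count, sizeof(*storage));
    if (!storage)
        return -1;
    for (i = 0; i < count; i++) {
        storage[i] = malloc(sizeof(struct node));
        if (!storage[i])
            goto out_cleanup;
    }

    pthread_mutex_lock(&lock);
    for (i = 0; i < count; i += 2) {
        /* Pretend only every other slot needs a new node; real code
         * would link storage[used] into the tracked data structure,
         * which then owns it. */
        storage[used]->value = i;
        used++;
    }
    pthread_mutex_unlock(&lock);
    status = used;

out_cleanup:
    for (i = used; i < count; i++)
        free(storage[i]);   /* free(NULL) is a no-op */
    free(storage);
    return status;
}

int main(void)
{
    printf("consumed %d pre-allocated nodes\n", preload_and_consume(8));
    return 0;
}
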
177/* We are relying on page lock to serialize this */
178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
179{
180 int rv;
181
182 spin_lock_bh(&marks->im_lock);
183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184 spin_unlock_bh(&marks->im_lock);
185 return rv;
186}
187
188/* Assume start, end already sector aligned */
189static int
190_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
191{
192 struct pnfs_inval_tracking *pos;
193 u64 expect = 0;
194
195 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
196 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
197 if (pos->it_sector >= end)
198 continue;
199 if (!expect) {
200 if ((pos->it_sector == end - tree->mtt_step_size) &&
201 (pos->it_tags & (1 << tag))) {
202 expect = pos->it_sector - tree->mtt_step_size;
203 if (pos->it_sector < tree->mtt_step_size || expect < start)
204 return 1;
205 continue;
206 } else {
207 return 0;
208 }
209 }
210 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
211 return 0;
212 expect -= tree->mtt_step_size;
213 if (expect < start)
214 return 1;
215 }
216 return 0;
217}
218
219static int is_range_written(struct pnfs_inval_markings *marks,
220 sector_t start, sector_t end)
221{
222 int rv;
223
224 spin_lock_bh(&marks->im_lock);
225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226 spin_unlock_bh(&marks->im_lock);
227 return rv;
228}
229
230/* Marks sectors in [offset, offset+length) as having been initialized.
231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
232 * Currently assumes offset is page-aligned
233 */
234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
235 sector_t offset, sector_t length)
236{
237 sector_t start, end;
238
239 dprintk("%s(offset=%llu,len=%llu) enter\n",
240 __func__, (u64)offset, (u64)length);
241
242 start = normalize(offset, marks->im_block_size);
243 end = normalize_up(offset + length, marks->im_block_size);
244 if (_preload_range(marks, start, end - start))
245 goto outerr;
246
247 spin_lock_bh(&marks->im_lock);
248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 goto out_unlock;
250 spin_unlock_bh(&marks->im_lock);
251
252 return 0;
253
254out_unlock:
255 spin_unlock_bh(&marks->im_lock);
256outerr:
257 return -ENOMEM;
258}
259
260/* Marks sectors in [offset, offset+length) as having been written to disk.
261 * All lengths should be block aligned.
262 */
263static int mark_written_sectors(struct pnfs_inval_markings *marks,
264 sector_t offset, sector_t length)
265{
266 int status;
267
268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 (u64)offset, (u64)length);
270 spin_lock_bh(&marks->im_lock);
271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272 spin_unlock_bh(&marks->im_lock);
273 return status;
274}
275
276static void print_short_extent(struct pnfs_block_short_extent *be)
277{
278 dprintk("PRINT SHORT EXTENT extent %p\n", be);
279 if (be) {
280 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
281 dprintk(" be_length %llu\n", (u64)be->bse_length);
282 }
283}
284
285static void print_clist(struct list_head *list, unsigned int count)
286{
287 struct pnfs_block_short_extent *be;
288 unsigned int i = 0;
289
290 ifdebug(FACILITY) {
291 printk(KERN_DEBUG "****************\n");
292 printk(KERN_DEBUG "Extent list looks like:\n");
293 list_for_each_entry(be, list, bse_node) {
294 i++;
295 print_short_extent(be);
296 }
297 if (i != count)
298 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
299 printk(KERN_DEBUG "****************\n");
300 }
301}
302
303/* Note: In theory, we should do more checking that devids match between
304 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
305 */
306/* Note this is very similar to bl_add_merge_extent */
307static void add_to_commitlist(struct pnfs_block_layout *bl,
308 struct pnfs_block_short_extent *new)
309{
310 struct list_head *clist = &bl->bl_commit;
311 struct pnfs_block_short_extent *old, *save;
312 sector_t end = new->bse_f_offset + new->bse_length;
313
314 dprintk("%s enter\n", __func__);
315 print_short_extent(new);
316 print_clist(clist, bl->bl_count);
317 bl->bl_count++;
318 /* Scan for proper place to insert, extending new to the left
319 * as much as possible.
320 */
321 list_for_each_entry_safe(old, save, clist, bse_node) {
322 if (new->bse_f_offset < old->bse_f_offset)
323 break;
324 if (end <= old->bse_f_offset + old->bse_length) {
325 /* Range is already in list */
326 bl->bl_count--;
327 kfree(new);
328 return;
329 } else if (new->bse_f_offset <=
330 old->bse_f_offset + old->bse_length) {
331 /* new overlaps or abuts existing be */
332 if (new->bse_mdev == old->bse_mdev) {
333 /* extend new to fully replace old */
334 new->bse_length += new->bse_f_offset -
335 old->bse_f_offset;
336 new->bse_f_offset = old->bse_f_offset;
337 list_del(&old->bse_node);
338 bl->bl_count--;
339 kfree(old);
340 }
341 }
342 }
343 /* Note that if we never hit the above break, old will not point to a
344 * valid extent. However, in that case &old->bse_node==list.
345 */
346 list_add_tail(&new->bse_node, &old->bse_node);
347 /* Scan forward for overlaps. If we find any, extend new and
348 * remove the overlapped extent.
349 */
350 old = list_prepare_entry(new, clist, bse_node);
351 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
352 if (end < old->bse_f_offset)
353 break;
354 /* new overlaps or abuts old */
355 if (new->bse_mdev == old->bse_mdev) {
356 if (end < old->bse_f_offset + old->bse_length) {
357 /* extend new to fully cover old */
358 end = old->bse_f_offset + old->bse_length;
359 new->bse_length = end - new->bse_f_offset;
360 }
361 list_del(&old->bse_node);
362 bl->bl_count--;
363 kfree(old);
364 }
365 }
366 dprintk("%s: after merging\n", __func__);
367 print_clist(clist, bl->bl_count);
368}
369
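
The commitlist logic above is an insert-with-merge over a sorted interval list. A self-contained sketch of that discipline on half-open [start, end) ranges (hypothetical types; the kernel code additionally requires matching bse_mdev before it will merge):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ival {
    uint64_t start, end;    /* half-open [start, end) */
    struct ival *next;
};

/* Insert [start, end) into a sorted list, absorbing every interval that
 * overlaps or abuts it -- the same merging add_to_commitlist() applies. */
static struct ival *insert_merged(struct ival *head, uint64_t start, uint64_t end)
{
    struct ival **pp = &head, *cur, *n;

    /* Skip intervals that end strictly before the new range. */
    while ((cur = *pp) && cur->end < start)
        pp = &cur->next;
    /* Absorb everything that touches [start, end), widening as we go. */
    while ((cur = *pp) && cur->start <= end) {
        if (cur->start < start)
            start = cur->start;
        if (cur->end > end)
            end = cur->end;
        *pp = cur->next;
        free(cur);
    }
    n = malloc(sizeof(*n));
    if (!n)
        abort();
    n->start = start;
    n->end = end;
    n->next = *pp;
    *pp = n;
    return head;
}

int main(void)
{
    struct ival *head = NULL, *p;

    head = insert_merged(head, 0, 8);
    head = insert_merged(head, 16, 24);
    head = insert_merged(head, 8, 16);  /* bridges both neighbours */
    for (p = head; p; p = p->next)      /* prints a single [0, 24) */
        printf("[%llu, %llu)\n",
               (unsigned long long)p->start,
               (unsigned long long)p->end);
    return 0;
}
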
370/* Note the range described by offset, length is guaranteed to be contained
371 * within be.
372 * new will be freed, either by this function or add_to_commitlist if they
373 * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
374 */
375int bl_mark_for_commit(struct pnfs_block_extent *be,
376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
378{
379 sector_t new_end, end = offset + length;
380 struct pnfs_block_layout *bl = container_of(be->be_inval,
381 struct pnfs_block_layout,
382 bl_inval);
383
384 mark_written_sectors(be->be_inval, offset, length);
385 /* We want to add the range to commit list, but it must be
386 * block-normalized, and verified that the normalized range has
387 * been entirely written to disk.
388 */
389 new->bse_f_offset = offset;
390 offset = normalize(offset, bl->bl_blocksize);
391 if (offset < new->bse_f_offset) {
392 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
393 new->bse_f_offset = offset;
394 else
395 new->bse_f_offset = offset + bl->bl_blocksize;
396 }
397 new_end = normalize_up(end, bl->bl_blocksize);
398 if (end < new_end) {
399 if (is_range_written(be->be_inval, end, new_end))
400 end = new_end;
401 else
402 end = new_end - bl->bl_blocksize;
403 }
404 if (end <= new->bse_f_offset) {
405 kfree(new);
406 return 0;
407 }
408 new->bse_length = end - new->bse_f_offset;
409 new->bse_devid = be->be_devid;
410 new->bse_mdev = be->be_mdev;
411
412 spin_lock(&bl->bl_ext_lock);
413 add_to_commitlist(bl, new);
414 spin_unlock(&bl->bl_ext_lock);
415 return 0;
416}
417
418static void print_bl_extent(struct pnfs_block_extent *be)
419{
420 dprintk("PRINT EXTENT extent %p\n", be);
421 if (be) {
422 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
423 dprintk(" be_length %llu\n", (u64)be->be_length);
424 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
425 dprintk(" be_state %d\n", be->be_state);
426 }
427}
428
429static void
430destroy_extent(struct kref *kref)
431{
432 struct pnfs_block_extent *be;
433
434 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
435 dprintk("%s be=%p\n", __func__, be);
436 kfree(be);
437}
438
439void
440bl_put_extent(struct pnfs_block_extent *be)
441{
442 if (be) {
443 dprintk("%s enter %p (%i)\n", __func__, be,
444 atomic_read(&be->be_refcnt.refcount));
445 kref_put(&be->be_refcnt, destroy_extent);
446 }
447}
448
449struct pnfs_block_extent *bl_alloc_extent(void)
450{
451 struct pnfs_block_extent *be;
452
453 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
454 if (!be)
455 return NULL;
456 INIT_LIST_HEAD(&be->be_node);
457 kref_init(&be->be_refcnt);
458 be->be_inval = NULL;
459 return be;
460}
461
462static void print_elist(struct list_head *list)
463{
464 struct pnfs_block_extent *be;
465 dprintk("****************\n");
466 dprintk("Extent list looks like:\n");
467 list_for_each_entry(be, list, be_node) {
468 print_bl_extent(be);
469 }
470 dprintk("****************\n");
471}
472
473static inline int
474extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
475{
476 /* Note this assumes new->be_f_offset >= old->be_f_offset */
477 return (new->be_state == old->be_state) &&
478 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
479 ((new->be_v_offset - old->be_v_offset ==
480 new->be_f_offset - old->be_f_offset) &&
481 new->be_mdev == old->be_mdev));
482}
483
484/* Adds new to appropriate list in bl, modifying new and removing existing
485 * extents as appropriate to deal with overlaps.
486 *
487 * See bl_find_get_extent for list constraints.
488 *
489 * Refcount on new is already set. If we end up not using it, or error
490 * out, we need to put the reference.
491 *
492 * bl->bl_ext_lock is held by caller.
493 */
494int
495bl_add_merge_extent(struct pnfs_block_layout *bl,
496 struct pnfs_block_extent *new)
497{
498 struct pnfs_block_extent *be, *tmp;
499 sector_t end = new->be_f_offset + new->be_length;
500 struct list_head *list;
501
502 dprintk("%s enter with be=%p\n", __func__, new);
503 print_bl_extent(new);
504 list = &bl->bl_extents[bl_choose_list(new->be_state)];
505 print_elist(list);
506
507 /* Scan for proper place to insert, extending new to the left
508 * as much as possible.
509 */
510 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
511 if (new->be_f_offset >= be->be_f_offset + be->be_length)
512 break;
513 if (new->be_f_offset >= be->be_f_offset) {
514 if (end <= be->be_f_offset + be->be_length) {
515 /* new is a subset of existing be */
516 if (extents_consistent(be, new)) {
517 dprintk("%s: new is subset, ignoring\n",
518 __func__);
519 bl_put_extent(new);
520 return 0;
521 } else {
522 goto out_err;
523 }
524 } else {
525 /* |<-- be -->|
526 * |<-- new -->| */
527 if (extents_consistent(be, new)) {
528 /* extend new to fully replace be */
529 new->be_length += new->be_f_offset -
530 be->be_f_offset;
531 new->be_f_offset = be->be_f_offset;
532 new->be_v_offset = be->be_v_offset;
533 dprintk("%s: removing %p\n", __func__, be);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 } else {
537 goto out_err;
538 }
539 }
540 } else if (end >= be->be_f_offset + be->be_length) {
541 /* new extent overlaps existing be */
542 if (extents_consistent(be, new)) {
543 /* extend new to fully replace be */
544 dprintk("%s: removing %p\n", __func__, be);
545 list_del(&be->be_node);
546 bl_put_extent(be);
547 } else {
548 goto out_err;
549 }
550 } else if (end > be->be_f_offset) {
551 /* |<-- be -->|
552 *|<-- new -->| */
553 if (extents_consistent(new, be)) {
554 /* extend new to fully replace be */
555 new->be_length += be->be_f_offset + be->be_length -
556 new->be_f_offset - new->be_length;
557 dprintk("%s: removing %p\n", __func__, be);
558 list_del(&be->be_node);
559 bl_put_extent(be);
560 } else {
561 goto out_err;
562 }
563 }
564 }
565 /* Note that if we never hit the above break, be will not point to a
566 * valid extent. However, in that case &be->be_node==list.
567 */
568 list_add(&new->be_node, &be->be_node);
569 dprintk("%s: inserting new\n", __func__);
570 print_elist(list);
571 /* FIXME - The per-list consistency checks have all been done,
572 * should now check cross-list consistency.
573 */
574 return 0;
575
576 out_err:
577 bl_put_extent(new);
578 return -EIO;
579}
580
581/* Returns extent, or NULL. If a second READ extent exists, it is returned
582 * in cow_read, if given.
583 *
584 * The extents are kept in two separate ordered lists, one for READ and NONE,
585 * one for READWRITE and INVALID. Within each list, we assume:
586 * 1. Extents are ordered by file offset.
587 * 2. For any given isect, there is at most one extent that matches.
588 */
589struct pnfs_block_extent *
590bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
591 struct pnfs_block_extent **cow_read)
592{
593 struct pnfs_block_extent *be, *cow, *ret;
594 int i;
595
596 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
597 cow = ret = NULL;
598 spin_lock(&bl->bl_ext_lock);
599 for (i = 0; i < EXTENT_LISTS; i++) {
600 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
601 if (isect >= be->be_f_offset + be->be_length)
602 break;
603 if (isect >= be->be_f_offset) {
604 /* We have found an extent */
605 dprintk("%s Get %p (%i)\n", __func__, be,
606 atomic_read(&be->be_refcnt.refcount));
607 kref_get(&be->be_refcnt);
608 if (!ret)
609 ret = be;
610 else if (be->be_state != PNFS_BLOCK_READ_DATA)
611 bl_put_extent(be);
612 else
613 cow = be;
614 break;
615 }
616 }
617 if (ret &&
618 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
619 break;
620 }
621 spin_unlock(&bl->bl_ext_lock);
622 if (cow_read)
623 *cow_read = cow;
624 print_bl_extent(ret);
625 return ret;
626}
627
628/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
629static struct pnfs_block_extent *
630bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
631{
632 struct pnfs_block_extent *be, *ret = NULL;
633 int i;
634
635 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
636 for (i = 0; i < EXTENT_LISTS; i++) {
637 if (ret)
638 break;
639 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
640 if (isect >= be->be_f_offset + be->be_length)
641 break;
642 if (isect >= be->be_f_offset) {
643 /* We have found an extent */
644 dprintk("%s Get %p (%i)\n", __func__, be,
645 atomic_read(&be->be_refcnt.refcount));
646 kref_get(&be->be_refcnt);
647 ret = be;
648 break;
649 }
650 }
651 }
652 print_bl_extent(ret);
653 return ret;
654}
655
656int
657encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
658 struct xdr_stream *xdr,
659 const struct nfs4_layoutcommit_args *arg)
660{
661 struct pnfs_block_short_extent *lce, *save;
662 unsigned int count = 0;
663 __be32 *p, *xdr_start;
664
665 dprintk("%s enter\n", __func__);
666 /* BUG - creation of bl_commit is buggy - need to wait for
667 * entire block to be marked WRITTEN before it can be added.
668 */
669 spin_lock(&bl->bl_ext_lock);
670 /* Want to adjust for possible truncate */
671 /* We now want to adjust argument range */
672
673 /* XDR encode the ranges found */
674 xdr_start = xdr_reserve_space(xdr, 8);
675 if (!xdr_start)
676 goto out;
677 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
678 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
679 if (!p)
680 break;
681 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
682 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 bl->bl_count--;
688 count++;
689 }
690 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
691 xdr_start[1] = cpu_to_be32(count);
692out:
693 spin_unlock(&bl->bl_ext_lock);
694 dprintk("%s found %i ranges\n", __func__, count);
695 return 0;
696}
697
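
The encoder above reserves xdr_start[0]/[1] before it knows how many ranges it will emit, then back-patches the byte length and count once the loop finishes. A userspace sketch of that reserve-then-back-patch pattern over a plain byte buffer (hypothetical helper; htonl stands in for cpu_to_be32):

#include <arpa/inet.h>  /* htonl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reserve two 32-bit header slots (byte length and entry count), encode a
 * variable number of entries, then back-patch the header. Returns the
 * total encoded size in bytes. */
static size_t encode_entries(uint8_t *buf, const uint32_t *vals, uint32_t n)
{
    uint8_t *p = buf + 8;   /* skip the reserved header */
    uint32_t i, be_len, be_n;

    for (i = 0; i < n; i++) {
        uint32_t be = htonl(vals[i]);
        memcpy(p, &be, 4);
        p += 4;
    }
    /* Like xdr_start[0] above: length of everything after the length word. */
    be_len = htonl((uint32_t)(p - buf - 4));
    be_n = htonl(n);
    memcpy(buf, &be_len, 4);
    memcpy(buf + 4, &be_n, 4);
    return (size_t)(p - buf);
}

int main(void)
{
    uint8_t buf[64];
    uint32_t vals[] = { 7, 42, 99 };

    printf("encoded %zu bytes\n", encode_entries(buf, vals, 3)); /* 20 */
    return 0;
}
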
698/* Helper function to set_to_rw that initializes a new extent */
699static void
700_prep_new_extent(struct pnfs_block_extent *new,
701 struct pnfs_block_extent *orig,
702 sector_t offset, sector_t length, int state)
703{
704 kref_init(&new->be_refcnt);
705 /* don't need to INIT_LIST_HEAD(&new->be_node) */
706 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
707 new->be_mdev = orig->be_mdev;
708 new->be_f_offset = offset;
709 new->be_length = length;
710 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
711 new->be_state = state;
712 new->be_inval = orig->be_inval;
713}
714
715/* Tries to merge be with extent in front of it in list.
716 * Frees storage if not used.
717 */
718static struct pnfs_block_extent *
719_front_merge(struct pnfs_block_extent *be, struct list_head *head,
720 struct pnfs_block_extent *storage)
721{
722 struct pnfs_block_extent *prev;
723
724 if (!storage)
725 goto no_merge;
726 if (&be->be_node == head || be->be_node.prev == head)
727 goto no_merge;
728 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
729 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
730 !extents_consistent(prev, be))
731 goto no_merge;
732 _prep_new_extent(storage, prev, prev->be_f_offset,
733 prev->be_length + be->be_length, prev->be_state);
734 list_replace(&prev->be_node, &storage->be_node);
735 bl_put_extent(prev);
736 list_del(&be->be_node);
737 bl_put_extent(be);
738 return storage;
739
740 no_merge:
741 kfree(storage);
742 return be;
743}
744
745static u64
746set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
747{
748 u64 rv = offset + length;
749 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
750 struct pnfs_block_extent *children[3];
751 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
752 int i = 0, j;
753
754 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
755 /* Create storage for up to three new extents e1, e2, e3 */
756 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
757 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
758 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
759 /* BUG - we are ignoring any failure */
760 if (!e1 || !e2 || !e3)
761 goto out_nosplit;
762
763 spin_lock(&bl->bl_ext_lock);
764 be = bl_find_get_extent_locked(bl, offset);
765 rv = be->be_f_offset + be->be_length;
766 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
767 spin_unlock(&bl->bl_ext_lock);
768 goto out_nosplit;
769 }
770 /* Add e* to children, bumping e*'s krefs */
771 if (be->be_f_offset != offset) {
772 _prep_new_extent(e1, be, be->be_f_offset,
773 offset - be->be_f_offset,
774 PNFS_BLOCK_INVALID_DATA);
775 children[i++] = e1;
776 print_bl_extent(e1);
777 } else
778 merge1 = e1;
779 _prep_new_extent(e2, be, offset,
780 min(length, be->be_f_offset + be->be_length - offset),
781 PNFS_BLOCK_READWRITE_DATA);
782 children[i++] = e2;
783 print_bl_extent(e2);
784 if (offset + length < be->be_f_offset + be->be_length) {
785 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
786 be->be_f_offset + be->be_length -
787 offset - length,
788 PNFS_BLOCK_INVALID_DATA);
789 children[i++] = e3;
790 print_bl_extent(e3);
791 } else
792 merge2 = e3;
793
794 /* Remove be from list, and insert the e* */
795 /* We don't get refs on e*, since this list is the base reference
796 * set when init'ed.
797 */
798 if (i < 3)
799 children[i] = NULL;
800 new = children[0];
801 list_replace(&be->be_node, &new->be_node);
802 bl_put_extent(be);
803 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
804 for (j = 1; j < i; j++) {
805 old = new;
806 new = children[j];
807 list_add(&new->be_node, &old->be_node);
808 }
809 if (merge2) {
810 /* This is a HACK, should just create a _back_merge function */
811 new = list_entry(new->be_node.next,
812 struct pnfs_block_extent, be_node);
813 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
814 }
815 spin_unlock(&bl->bl_ext_lock);
816
817 /* Since we removed the base reference above, be is now scheduled for
818 * destruction.
819 */
820 bl_put_extent(be);
821 dprintk("%s returns %llu after split\n", __func__, rv);
822 return rv;
823
824 out_nosplit:
825 kfree(e1);
826 kfree(e2);
827 kfree(e3);
828 dprintk("%s returns %llu without splitting\n", __func__, rv);
829 return rv;
830}
831
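
set_to_rw() above carves one INVALID extent into at most three pieces: an untouched left remainder (e1), the converted middle (e2), and an untouched right remainder (e3). A sketch of just that carving arithmetic, assuming the requested range starts inside the extent (hypothetical names):

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

/* Split [ext_start, ext_start+ext_len) around [off, off+len) and return
 * how many pieces (1 to 3) were produced. */
static int split_extent(uint64_t ext_start, uint64_t ext_len,
                        uint64_t off, uint64_t len, struct range out[3])
{
    uint64_t ext_end = ext_start + ext_len;
    uint64_t end = (off + len > ext_end) ? ext_end : off + len;
    int n = 0;

    if (off > ext_start) {          /* left remainder (e1) */
        out[n].start = ext_start;
        out[n++].len = off - ext_start;
    }
    out[n].start = off;             /* converted middle (e2) */
    out[n++].len = end - off;
    if (end < ext_end) {            /* right remainder (e3) */
        out[n].start = end;
        out[n++].len = ext_end - end;
    }
    return n;
}

int main(void)
{
    struct range out[3];
    int i, n = split_extent(0, 100, 30, 40, out);

    for (i = 0; i < n; i++)         /* [0,+30) [30,+40) [70,+30) */
        printf("[%llu, +%llu)\n",
               (unsigned long long)out[i].start,
               (unsigned long long)out[i].len);
    return 0;
}
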
832void
833clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
834 const struct nfs4_layoutcommit_args *arg,
835 int status)
836{
837 struct pnfs_block_short_extent *lce, *save;
838
839 dprintk("%s status %d\n", __func__, status);
840 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
841 if (likely(!status)) {
842 u64 offset = lce->bse_f_offset;
843 u64 end = offset + lce->bse_length;
844
845 do {
846 offset = set_to_rw(bl, offset, end - offset);
847 } while (offset < end);
848 list_del(&lce->bse_node);
849
850 kfree(lce);
851 } else {
852 list_del(&lce->bse_node);
853 spin_lock(&bl->bl_ext_lock);
854 add_to_commitlist(bl, lce);
855 spin_unlock(&bl->bl_ext_lock);
856 }
857 }
858}
859
860int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
861{
862 struct pnfs_block_short_extent *new;
863
864 new = kmalloc(sizeof(*new), GFP_NOFS);
865 if (unlikely(!new))
866 return -ENOMEM;
867
868 spin_lock_bh(&marks->im_lock);
869 list_add(&new->bse_node, &marks->im_extents);
870 spin_unlock_bh(&marks->im_lock);
871
872 return 0;
873}
874
875struct pnfs_block_short_extent *
876bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
877{
878 struct pnfs_block_short_extent *rv = NULL;
879
880 spin_lock_bh(&marks->im_lock);
881 if (!list_empty(&marks->im_extents)) {
882 rv = list_entry((&marks->im_extents)->next,
883 struct pnfs_block_short_extent, bse_node);
884 list_del_init(&rv->bse_node);
885 }
886 spin_unlock_bh(&marks->im_lock);
887
888 return rv;
889}
890
891void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
892{
893 struct pnfs_block_short_extent *se = NULL, *tmp;
894
895 if (num_to_free <= 0)
896 return;
897
898 spin_lock(&marks->im_lock);
899 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
900 list_del(&se->bse_node);
901 kfree(se);
902 if (--num_to_free == 0)
903 break;
904 }
905 spin_unlock(&marks->im_lock);
906
907 BUG_ON(num_to_free > 0);
908}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..8d04bda2bd2e
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,285 @@
1/*
2 * Copyright (c) 2006,2007 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Andy Adamson <andros@citi.umich.edu>
6 * Fred Isaman <iisaman@umich.edu>
7 *
8 * permission is granted to use, copy, create derivative works and
9 * redistribute this software and such derivative works for any purpose,
10 * so long as the name of the university of michigan is not used in
11 * any advertising or publicity pertaining to the use or distribution
12 * of this software without specific, written prior authorization. if
13 * the above copyright notice or any other identification of the
14 * university of michigan is included in any copy of any portion of
15 * this software, then the disclaimer below must also be included.
16 *
17 * this software is provided as is, without representation from the
18 * university of michigan as to its fitness for any purpose, and without
19 * warranty by the university of michigan of any kind, either express
20 * or implied, including without limitation the implied warranties of
21 * merchantability and fitness for a particular purpose. the regents
22 * of the university of michigan shall not be liable for any damages,
23 * including special, indirect, incidental, or consequential damages,
24 * with respect to any claim arising out or in connection with the use
25 * of the software, even if it has been or is hereafter advised of the
26 * possibility of such damages.
27 */
28
29#include <linux/module.h>
30#include <linux/genhd.h>
31#include <linux/blkdev.h>
32
33#include "blocklayout.h"
34
35#define NFSDBG_FACILITY NFSDBG_PNFS_LD
36
37static void
38nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
39{
40 int i;
41
42 *p++ = cpu_to_be32(1);
43 *p++ = cpu_to_be32(b->type);
44 *p++ = cpu_to_be32(b->simple.nr_sigs);
45 for (i = 0; i < b->simple.nr_sigs; i++) {
46 p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
47 p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
48 b->simple.sigs[i].sig_len);
49 }
50}
51
52dev_t
53bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
54 gfp_t gfp_mask)
55{
56 struct net *net = server->nfs_client->cl_net;
57 struct nfs_net *nn = net_generic(net, nfs_net_id);
58 struct bl_dev_msg *reply = &nn->bl_mount_reply;
59 struct bl_pipe_msg bl_pipe_msg;
60 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
61 struct bl_msg_hdr *bl_msg;
62 DECLARE_WAITQUEUE(wq, current);
63 dev_t dev = 0;
64 int rc;
65
66 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
67
68 bl_pipe_msg.bl_wq = &nn->bl_wq;
69
70 b->simple.len += 4; /* single volume */
71 if (b->simple.len > PAGE_SIZE)
72 return -EIO;
73
74 memset(msg, 0, sizeof(*msg));
75 msg->len = sizeof(*bl_msg) + b->simple.len;
76 msg->data = kzalloc(msg->len, gfp_mask);
77 if (!msg->data)
78 goto out;
79
80 bl_msg = msg->data;
81	bl_msg->type = BL_DEVICE_MOUNT;
82 bl_msg->totallen = b->simple.len;
83 nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
84
85 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
86 add_wait_queue(&nn->bl_wq, &wq);
87 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
88 if (rc < 0) {
89 remove_wait_queue(&nn->bl_wq, &wq);
90 goto out;
91 }
92
93 set_current_state(TASK_UNINTERRUPTIBLE);
94 schedule();
95 __set_current_state(TASK_RUNNING);
96 remove_wait_queue(&nn->bl_wq, &wq);
97
98 if (reply->status != BL_DEVICE_REQUEST_PROC) {
99 printk(KERN_WARNING "%s failed to decode device: %d\n",
100 __func__, reply->status);
101 goto out;
102 }
103
104 dev = MKDEV(reply->major, reply->minor);
105out:
106 kfree(msg->data);
107 return dev;
108}
109
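
bl_resolve_deviceid() above is a classic upcall handshake: queue a message to the userspace daemon, sleep on a waitqueue, and let the downcall store the reply and wake the sleeper. A rough userspace analogue using a condition variable (compile with -pthread; this deliberately ignores the uninterruptible-sleep and pipe-refcounting details):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static int reply_ready;
static int reply_value;

/* Stand-in for the daemon side: like bl_pipe_downcall(), it stores the
 * reply and wakes whoever is sleeping on the queue. */
static void *daemon_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    reply_value = 42;
    reply_ready = 1;
    pthread_cond_signal(&wq);   /* wake_up(&nn->bl_wq) */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, daemon_thread, NULL);

    /* Resolver side: "queue the upcall", then sleep until the downcall
     * arrives -- the add_wait_queue()/schedule() pair above. */
    pthread_mutex_lock(&lock);
    while (!reply_ready)
        pthread_cond_wait(&wq, &lock);
    pthread_mutex_unlock(&lock);

    printf("got reply %d\n", reply_value);
    pthread_join(t, NULL);
    return 0;
}
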
110static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
111 size_t mlen)
112{
113 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
114 nfs_net_id);
115
116 if (mlen != sizeof (struct bl_dev_msg))
117 return -EINVAL;
118
119 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
120 return -EFAULT;
121
122 wake_up(&nn->bl_wq);
123
124 return mlen;
125}
126
127static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
128{
129 struct bl_pipe_msg *bl_pipe_msg =
130 container_of(msg, struct bl_pipe_msg, msg);
131
132 if (msg->errno >= 0)
133 return;
134 wake_up(bl_pipe_msg->bl_wq);
135}
136
137static const struct rpc_pipe_ops bl_upcall_ops = {
138 .upcall = rpc_pipe_generic_upcall,
139 .downcall = bl_pipe_downcall,
140 .destroy_msg = bl_pipe_destroy_msg,
141};
142
143static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
144 struct rpc_pipe *pipe)
145{
146 struct dentry *dir, *dentry;
147
148 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
149 if (dir == NULL)
150 return ERR_PTR(-ENOENT);
151 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
152 dput(dir);
153 return dentry;
154}
155
156static void nfs4blocklayout_unregister_sb(struct super_block *sb,
157 struct rpc_pipe *pipe)
158{
159 if (pipe->dentry)
160 rpc_unlink(pipe->dentry);
161}
162
163static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
164 void *ptr)
165{
166 struct super_block *sb = ptr;
167 struct net *net = sb->s_fs_info;
168 struct nfs_net *nn = net_generic(net, nfs_net_id);
169 struct dentry *dentry;
170 int ret = 0;
171
172 if (!try_module_get(THIS_MODULE))
173 return 0;
174
175 if (nn->bl_device_pipe == NULL) {
176 module_put(THIS_MODULE);
177 return 0;
178 }
179
180 switch (event) {
181 case RPC_PIPEFS_MOUNT:
182 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
183 if (IS_ERR(dentry)) {
184 ret = PTR_ERR(dentry);
185 break;
186 }
187 nn->bl_device_pipe->dentry = dentry;
188 break;
189 case RPC_PIPEFS_UMOUNT:
190 if (nn->bl_device_pipe->dentry)
191 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
192 break;
193 default:
194 ret = -ENOTSUPP;
195 break;
196 }
197 module_put(THIS_MODULE);
198 return ret;
199}
200
201static struct notifier_block nfs4blocklayout_block = {
202 .notifier_call = rpc_pipefs_event,
203};
204
205static struct dentry *nfs4blocklayout_register_net(struct net *net,
206 struct rpc_pipe *pipe)
207{
208 struct super_block *pipefs_sb;
209 struct dentry *dentry;
210
211 pipefs_sb = rpc_get_sb_net(net);
212 if (!pipefs_sb)
213 return NULL;
214 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
215 rpc_put_sb_net(net);
216 return dentry;
217}
218
219static void nfs4blocklayout_unregister_net(struct net *net,
220 struct rpc_pipe *pipe)
221{
222 struct super_block *pipefs_sb;
223
224 pipefs_sb = rpc_get_sb_net(net);
225 if (pipefs_sb) {
226 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
227 rpc_put_sb_net(net);
228 }
229}
230
231static int nfs4blocklayout_net_init(struct net *net)
232{
233 struct nfs_net *nn = net_generic(net, nfs_net_id);
234 struct dentry *dentry;
235
236 init_waitqueue_head(&nn->bl_wq);
237 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
238 if (IS_ERR(nn->bl_device_pipe))
239 return PTR_ERR(nn->bl_device_pipe);
240 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
241 if (IS_ERR(dentry)) {
242 rpc_destroy_pipe_data(nn->bl_device_pipe);
243 return PTR_ERR(dentry);
244 }
245 nn->bl_device_pipe->dentry = dentry;
246 return 0;
247}
248
249static void nfs4blocklayout_net_exit(struct net *net)
250{
251 struct nfs_net *nn = net_generic(net, nfs_net_id);
252
253 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
254 rpc_destroy_pipe_data(nn->bl_device_pipe);
255 nn->bl_device_pipe = NULL;
256}
257
258static struct pernet_operations nfs4blocklayout_net_ops = {
259 .init = nfs4blocklayout_net_init,
260 .exit = nfs4blocklayout_net_exit,
261};
262
263int __init bl_init_pipefs(void)
264{
265 int ret;
266
267 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
268 if (ret)
269 goto out;
270 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
271 if (ret)
272 goto out_unregister_notifier;
273 return 0;
274
275out_unregister_notifier:
276 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
277out:
278 return ret;
279}
280
281void __exit bl_cleanup_pipefs(void)
282{
283 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
284 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
285}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
171 goto out; 171 goto out;
172 172
173 ino = lo->plh_inode; 173 ino = lo->plh_inode;
174
175 spin_lock(&ino->i_lock);
176 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
177 spin_unlock(&ino->i_lock);
178
179 pnfs_layoutcommit_inode(ino, false);
180
174 spin_lock(&ino->i_lock); 181 spin_lock(&ino->i_lock);
175 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 182 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
176 pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, 183 pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
177 &args->cbl_range)) 184 &args->cbl_range)) {
178 rv = NFS4ERR_DELAY; 185 rv = NFS4ERR_DELAY;
179 else 186 goto unlock;
180 rv = NFS4ERR_NOMATCHING_LAYOUT; 187 }
181 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); 188
189 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
190 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
191 &args->cbl_range);
192 }
193unlock:
182 spin_unlock(&ino->i_lock); 194 spin_unlock(&ino->i_lock);
183 pnfs_free_lseg_list(&free_me_list); 195 pnfs_free_lseg_list(&free_me_list);
184 pnfs_put_layout_hdr(lo); 196 pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
277 } 289 }
278 290
279 found: 291 found:
280 if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
281 dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
282 "deleting instead\n", __func__);
283 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); 292 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
284 } 293 }
285 294
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 94088517039f..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1252 * set up the iterator to start reading from the server list and return the first item 1252 * set up the iterator to start reading from the server list and return the first item
1253 */ 1253 */
1254static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1254static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1255 __acquires(&nn->nfs_client_lock)
1255{ 1256{
1256 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1257 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1257 1258
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1274 * clean up after reading from the transports list 1275 * clean up after reading from the transports list
1275 */ 1276 */
1276static void nfs_server_list_stop(struct seq_file *p, void *v) 1277static void nfs_server_list_stop(struct seq_file *p, void *v)
1278 __releases(&nn->nfs_client_lock)
1277{ 1279{
1278 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1280 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1279 1281
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1326 * set up the iterator to start reading from the volume list and return the first item 1328 * set up the iterator to start reading from the volume list and return the first item
1327 */ 1329 */
1328static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1330static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1331 __acquires(&nn->nfs_client_lock)
1329{ 1332{
1330 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1333 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1331 1334
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1348 * clean up after reading from the transports list 1351 * clean up after reading from the transports list
1349 */ 1352 */
1350static void nfs_volume_list_stop(struct seq_file *p, void *v) 1353static void nfs_volume_list_stop(struct seq_file *p, void *v)
1354 __releases(&nn->nfs_client_lock)
1351{ 1355{
1352 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1356 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1353 1357
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
178 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 178 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
179} 179}
180 180
181#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
182/* 181/*
183 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data 182 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
184 * @dreq - direct request possibly spanning multiple servers 183 * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
197 WARN_ON_ONCE(verfp->committed < 0); 196 WARN_ON_ONCE(verfp->committed < 0);
198 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
199} 198}
200#endif
201 199
202/** 200/**
203 * nfs_direct_IO - NFS address space operation for direct I/O 201 * nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
576 return result; 574 return result;
577} 575}
578 576
579#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
580static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 577static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
581{ 578{
582 struct nfs_pageio_descriptor desc; 579 struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
700 schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ 697 schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
701} 698}
702 699
703#else
704static void nfs_direct_write_schedule_work(struct work_struct *work)
705{
706}
707
708static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
709{
710 nfs_direct_complete(dreq, true);
711}
712#endif
713
714static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) 700static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
715{ 701{
716 struct nfs_direct_req *dreq = hdr->dreq; 702 struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d1898..6920127c5eb7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#include "nfstrace.h" 41#include "nfstrace.h"
41 42
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
327 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 328 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
328 unsigned int end = offset + len; 329 unsigned int end = offset + len;
329 330
331 if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
332 if (!PageUptodate(page))
333 return 1;
334 return 0;
335 }
336
330 if ((file->f_mode & FMODE_READ) && /* open for read? */ 337 if ((file->f_mode & FMODE_READ) && /* open for read? */
331 !PageUptodate(page) && /* Uptodate? */ 338 !PageUptodate(page) && /* Uptodate? */
332 !PagePrivate(page) && /* i/o request already? */ 339 !PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
468 475
469 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
470 477
471 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Always try to initiate a 'commit' if relevant, but only
472 * doing this memory reclaim for a fs-related allocation. 479 * wait for it if __GFP_WAIT is set. Even then, only wait 1
480 * second and only if the 'bdi' is not congested.
481 * Waiting indefinitely can cause deadlocks when the NFS
482 * server is on this machine, when a new TCP connection is
483 * needed and in other rare cases. There is no particular
484 * need to wait extensively here. A short wait has the
485 * benefit that someone else can worry about the freezer.
473 */ 486 */
474 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 487 if (mapping) {
475 !(current->flags & PF_FSTRANS)) { 488 struct nfs_server *nfss = NFS_SERVER(mapping->host);
476 int how = FLUSH_SYNC; 489 nfs_commit_inode(mapping->host, 0);
477 490 if ((gfp & __GFP_WAIT) &&
478 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 491 !bdi_write_congested(&nfss->backing_dev_info)) {
479 if (current_is_kswapd()) 492 wait_on_page_bit_killable_timeout(page, PG_private,
480 how = 0; 493 HZ);
481 nfs_commit_inode(mapping->host, how); 494 if (PagePrivate(page))
495 set_bdi_congested(&nfss->backing_dev_info,
496 BLK_RW_ASYNC);
497 }
482 } 498 }
483 /* If PagePrivate() is set, then the page is not freeable */ 499 /* If PagePrivate() is set, then the page is not freeable */
484 if (PagePrivate(page)) 500 if (PagePrivate(page))
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
539static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 555static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
540 sector_t *span) 556 sector_t *span)
541{ 557{
558 int ret;
559 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
560
542 *span = sis->pages; 561 *span = sis->pages;
543 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 562
563 rcu_read_lock();
564 ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
565 rcu_read_unlock();
566
567 return ret;
544} 568}
545 569
546static void nfs_swap_deactivate(struct file *file) 570static void nfs_swap_deactivate(struct file *file)
547{ 571{
548 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 572 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
573
574 rcu_read_lock();
575 xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
576 rcu_read_unlock();
549} 577}
550#endif 578#endif
551 579
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 90978075f730..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
265{ 265{
266 266
267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
268 hdr->res.verf->committed == NFS_FILE_SYNC) 268 hdr->res.verf->committed != NFS_DATA_SYNC)
269 return; 269 return;
270 270
271 pnfs_set_layoutcommit(hdr); 271 pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
403 return -EAGAIN; 403 return -EAGAIN;
404 } 404 }
405 405
406 if (data->verf.committed == NFS_UNSTABLE)
407 pnfs_commit_set_layoutcommit(data);
408
406 return 0; 409 return 0;
407} 410}
408 411
@@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
646 } 649 }
647 650
648 /* find and reference the deviceid */ 651 /* find and reference the deviceid */
649 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 652 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
650 NFS_SERVER(lo->plh_inode)->nfs_client, id); 653 lo->plh_lc_cred, gfp_flags);
651 if (d == NULL) { 654 if (d == NULL)
652 dsaddr = filelayout_get_device_info(lo->plh_inode, id, 655 goto out;
653 lo->plh_lc_cred, gfp_flags); 656
654 if (dsaddr == NULL) 657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
655 goto out;
656 } else
657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
658 /* Found deviceid is unavailable */ 658 /* Found deviceid is unavailable */
659 if (filelayout_test_devid_unavailable(&dsaddr->id_node)) 659 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
660 goto out_put; 660 goto out_put;
661 661
662 fl->dsaddr = dsaddr; 662 fl->dsaddr = dsaddr;
663 663
@@ -1368,6 +1368,17 @@ out:
1368 cinfo->ds->ncommitting = 0; 1368 cinfo->ds->ncommitting = 0;
1369 return PNFS_ATTEMPTED; 1369 return PNFS_ATTEMPTED;
1370} 1370}
1371static struct nfs4_deviceid_node *
1372filelayout_alloc_deviceid_node(struct nfs_server *server,
1373 struct pnfs_device *pdev, gfp_t gfp_flags)
1374{
1375 struct nfs4_file_layout_dsaddr *dsaddr;
1376
1377 dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
1378 if (!dsaddr)
1379 return NULL;
1380 return &dsaddr->id_node;
1381}
1371 1382
1372static void 1383static void
1373filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) 1384filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1420,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1420 .commit_pagelist = filelayout_commit_pagelist, 1431 .commit_pagelist = filelayout_commit_pagelist,
1421 .read_pagelist = filelayout_read_pagelist, 1432 .read_pagelist = filelayout_read_pagelist,
1422 .write_pagelist = filelayout_write_pagelist, 1433 .write_pagelist = filelayout_write_pagelist,
1434 .alloc_deviceid_node = filelayout_alloc_deviceid_node,
1423 .free_deviceid_node = filelayout_free_deveiceid_node, 1435 .free_deviceid_node = filelayout_free_deveiceid_node,
1424}; 1436};
1425 1437
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
149 u32 ds_idx); 149 u32 ds_idx);
150
151extern struct nfs4_file_layout_dsaddr *
152nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
153 struct pnfs_device *pdev, gfp_t gfp_flags);
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 154extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 155extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
155 156
156#endif /* FS_NFS_NFS4FILELAYOUT_H */ 157#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
484} 484}
485 485
486/* Decode opaque device data and return the result */ 486/* Decode opaque device data and return the result */
487static struct nfs4_file_layout_dsaddr* 487struct nfs4_file_layout_dsaddr *
488decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) 488nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
489 gfp_t gfp_flags)
489{ 490{
490 int i; 491 int i;
491 u32 cnt, num; 492 u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
570 dsaddr->stripe_indices = stripe_indices; 571 dsaddr->stripe_indices = stripe_indices;
571 stripe_indices = NULL; 572 stripe_indices = NULL;
572 dsaddr->ds_num = num; 573 dsaddr->ds_num = num;
573 nfs4_init_deviceid_node(&dsaddr->id_node, 574 nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
574 NFS_SERVER(ino)->pnfs_curr_ld,
575 NFS_SERVER(ino)->nfs_client,
576 &pdev->dev_id);
577 575
578 INIT_LIST_HEAD(&dsaddrs); 576 INIT_LIST_HEAD(&dsaddrs);
579 577
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
587 585
588 mp_count = be32_to_cpup(p); /* multipath count */ 586 mp_count = be32_to_cpup(p); /* multipath count */
589 for (j = 0; j < mp_count; j++) { 587 for (j = 0; j < mp_count; j++) {
590 da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, 588 da = decode_ds_addr(server->nfs_client->cl_net,
591 &stream, gfp_flags); 589 &stream, gfp_flags);
592 if (da) 590 if (da)
593 list_add_tail(&da->da_node, &dsaddrs); 591 list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
637 return NULL; 635 return NULL;
638} 636}
639 637
640/*
641 * Decode the opaque device specified in 'dev' and add it to the cache of
642 * available devices.
643 */
644static struct nfs4_file_layout_dsaddr *
645decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646{
647 struct nfs4_deviceid_node *d;
648 struct nfs4_file_layout_dsaddr *n, *new;
649
650 new = decode_device(inode, dev, gfp_flags);
651 if (!new) {
652 printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 __func__);
654 return NULL;
655 }
656
657 d = nfs4_insert_deviceid_node(&new->id_node);
658 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
659 if (n != new) {
660 nfs4_fl_free_deviceid(new);
661 return n;
662 }
663
664 return new;
665}
666
667/*
668 * Retrieve the information for dev_id, add it to the list
669 * of available devices, and return it.
670 */
671struct nfs4_file_layout_dsaddr *
672filelayout_get_device_info(struct inode *inode,
673 struct nfs4_deviceid *dev_id,
674 struct rpc_cred *cred,
675 gfp_t gfp_flags)
676{
677 struct pnfs_device *pdev = NULL;
678 u32 max_resp_sz;
679 int max_pages;
680 struct page **pages = NULL;
681 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
682 int rc, i;
683 struct nfs_server *server = NFS_SERVER(inode);
684
685 /*
686 * Use the session max response size as the basis for setting
687 * GETDEVICEINFO's maxcount
688 */
689 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690 max_pages = nfs_page_array_len(0, max_resp_sz);
691 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
692 __func__, inode, max_resp_sz, max_pages);
693
694 pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 if (pdev == NULL)
696 return NULL;
697
698 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 if (pages == NULL) {
700 kfree(pdev);
701 return NULL;
702 }
703 for (i = 0; i < max_pages; i++) {
704 pages[i] = alloc_page(gfp_flags);
705 if (!pages[i])
706 goto out_free;
707 }
708
709 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
710 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
711 pdev->pages = pages;
712 pdev->pgbase = 0;
713 pdev->pglen = max_resp_sz;
714 pdev->mincount = 0;
715 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716
717 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 dprintk("%s getdevice info returns %d\n", __func__, rc);
719 if (rc)
720 goto out_free;
721
722 /*
723 * Found new device, need to decode it and then add it to the
724 * list of known devices for this mountpoint.
725 */
726 dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727out_free:
728 for (i = 0; i < max_pages; i++)
729 __free_page(pages[i]);
730 kfree(pages);
731 kfree(pdev);
732 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
733 return dsaddr;
734}
735
736void 638void
737nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 639nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738{ 640{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
505 attr->ia_valid &= ~ATTR_MODE; 505 attr->ia_valid &= ~ATTR_MODE;
506 506
507 if (attr->ia_valid & ATTR_SIZE) { 507 if (attr->ia_valid & ATTR_SIZE) {
508 if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) 508 BUG_ON(!S_ISREG(inode->i_mode));
509
510 if (attr->ia_size == i_size_read(inode))
509 attr->ia_valid &= ~ATTR_SIZE; 511 attr->ia_valid &= ~ATTR_SIZE;
510 } 512 }
511 513
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9056622d2230..14ae6f20a172 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); 218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
219#endif 219#endif
220 220
221/* nfs3client.c */
222#if IS_ENABLED(CONFIG_NFS_V3)
223struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
224struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
225 struct nfs_fattr *, rpc_authflavor_t);
226#endif
227
228/* callback_xdr.c */ 221/* callback_xdr.c */
229extern struct svc_version nfs4_callback_version1; 222extern struct svc_version nfs4_callback_version1;
230extern struct svc_version nfs4_callback_version4; 223extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) 2014 Anna Schumaker.
3 *
4 * NFSv3-specific filesystem definitions and declarations
5 */
6#ifndef __LINUX_FS_NFS_NFS3_FS_H
7#define __LINUX_FS_NFS_NFS3_FS_H
8
9/*
10 * nfs3acl.c
11 */
12#ifdef CONFIG_NFS_V3_ACL
13extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
14extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
15extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
16 struct posix_acl *dfacl);
17extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
18extern const struct xattr_handler *nfs3_xattr_handlers[];
19#else
20static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
21 struct posix_acl *dfacl)
22{
23 return 0;
24}
25#define nfs3_listxattr NULL
26#endif /* CONFIG_NFS_V3_ACL */
27
28/* nfs3client.c */
29struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
30struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
31 struct nfs_fattr *, rpc_authflavor_t);
32
33
34#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 24c6898159cc..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
7#include <linux/nfsacl.h> 7#include <linux/nfsacl.h>
8 8
9#include "internal.h" 9#include "internal.h"
10#include "nfs3_fs.h"
10 11
11#define NFSDBG_FACILITY NFSDBG_PROC 12#define NFSDBG_FACILITY NFSDBG_PROC
12 13
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
1#include <linux/nfs_fs.h> 1#include <linux/nfs_fs.h>
2#include <linux/nfs_mount.h> 2#include <linux/nfs_mount.h>
3#include "internal.h" 3#include "internal.h"
4#include "nfs3_fs.h"
4 5
5#ifdef CONFIG_NFS_V3_ACL 6#ifdef CONFIG_NFS_V3_ACL
6static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; 7static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
22 22
23#include "iostat.h" 23#include "iostat.h"
24#include "internal.h" 24#include "internal.h"
25#include "nfs3_fs.h"
25 26
26#define NFSDBG_FACILITY NFSDBG_PROC 27#define NFSDBG_FACILITY NFSDBG_PROC
27 28
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
6#include "internal.h" 6#include "internal.h"
7#include "nfs3_fs.h"
7#include "nfs.h" 8#include "nfs.h"
8 9
9static struct nfs_subversion nfs_v3 = { 10static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0422d77b73c7..5aa55c132aa2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
77static int _nfs4_proc_open(struct nfs4_opendata *data); 77static int _nfs4_proc_open(struct nfs4_opendata *data);
78static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 78static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); 82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); 83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
314 kunmap_atomic(start); 314 kunmap_atomic(start);
315} 315}
316 316
317static long nfs4_update_delay(long *timeout)
318{
319 long ret;
320 if (!timeout)
321 return NFS4_POLL_RETRY_MAX;
322 if (*timeout <= 0)
323 *timeout = NFS4_POLL_RETRY_MIN;
324 if (*timeout > NFS4_POLL_RETRY_MAX)
325 *timeout = NFS4_POLL_RETRY_MAX;
326 ret = *timeout;
327 *timeout <<= 1;
328 return ret;
329}
330
317static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) 331static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
318{ 332{
319 int res = 0; 333 int res = 0;
320 334
321 might_sleep(); 335 might_sleep();
322 336
323 if (*timeout <= 0) 337 freezable_schedule_timeout_killable_unsafe(
324 *timeout = NFS4_POLL_RETRY_MIN; 338 nfs4_update_delay(timeout));
325 if (*timeout > NFS4_POLL_RETRY_MAX)
326 *timeout = NFS4_POLL_RETRY_MAX;
327 freezable_schedule_timeout_killable_unsafe(*timeout);
328 if (fatal_signal_pending(current)) 339 if (fatal_signal_pending(current))
329 res = -ERESTARTSYS; 340 res = -ERESTARTSYS;
330 *timeout <<= 1;
331 return res; 341 return res;
332} 342}
333 343
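The retry-delay bookkeeping that used to live inline in nfs4_delay() is hoisted into nfs4_update_delay(): a NULL timeout pointer yields the maximum poll interval, otherwise the stored value is clamped to [NFS4_POLL_RETRY_MIN, NFS4_POLL_RETRY_MAX] and doubled for the next round. That lets the async error path below reuse the same backoff through rpc_delay(). A minimal userspace model of the helper, with small stand-in constants (the kernel values are jiffy-based):

#include <stdio.h>

#define RETRY_MIN 10	/* stand-in for NFS4_POLL_RETRY_MIN */
#define RETRY_MAX 1500	/* stand-in for NFS4_POLL_RETRY_MAX */

/* Clamp the current delay, return it, and double it for next time. */
static long update_delay(long *timeout)
{
	long ret;

	if (!timeout)
		return RETRY_MAX;	/* stateless callers always wait the max */
	if (*timeout <= 0)
		*timeout = RETRY_MIN;
	if (*timeout > RETRY_MAX)
		*timeout = RETRY_MAX;
	ret = *timeout;
	*timeout <<= 1;
	return ret;
}

int main(void)
{
	long t = 0;

	for (int i = 0; i < 10; i++)
		printf("retry %d: wait %ld\n", i, update_delay(&t));
	return 0;
}

Starting from zero, successive waits run 10, 20, 40, ... and pin at the maximum, which is the progression both the blocking nfs4_delay() path and the async rpc_delay() path now share.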
@@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1307 int ret = -EAGAIN; 1317 int ret = -EAGAIN;
1308 1318
1309 for (;;) { 1319 for (;;) {
1320 spin_lock(&state->owner->so_lock);
1310 if (can_open_cached(state, fmode, open_mode)) { 1321 if (can_open_cached(state, fmode, open_mode)) {
1311 spin_lock(&state->owner->so_lock); 1322 update_open_stateflags(state, fmode);
1312 if (can_open_cached(state, fmode, open_mode)) {
1313 update_open_stateflags(state, fmode);
1314 spin_unlock(&state->owner->so_lock);
1315 goto out_return_state;
1316 }
1317 spin_unlock(&state->owner->so_lock); 1323 spin_unlock(&state->owner->so_lock);
1324 goto out_return_state;
1318 } 1325 }
1326 spin_unlock(&state->owner->so_lock);
1319 rcu_read_lock(); 1327 rcu_read_lock();
1320 delegation = rcu_dereference(nfsi->delegation); 1328 delegation = rcu_dereference(nfsi->delegation);
1321 if (!can_open_delegated(delegation, fmode)) { 1329 if (!can_open_delegated(delegation, fmode)) {
@@ -2589,7 +2597,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2589 if (calldata->arg.fmode == 0) 2597 if (calldata->arg.fmode == 0)
2590 break; 2598 break;
2591 default: 2599 default:
2592 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { 2600 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
2593 rpc_restart_call_prepare(task); 2601 rpc_restart_call_prepare(task);
2594 goto out_release; 2602 goto out_release;
2595 } 2603 }
@@ -3217,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
3217 struct nfs4_label *label = NULL; 3225 struct nfs4_label *label = NULL;
3218 int status; 3226 int status;
3219 3227
3220 if (pnfs_ld_layoutret_on_setattr(inode)) 3228 if (pnfs_ld_layoutret_on_setattr(inode) &&
3229 sattr->ia_valid & ATTR_SIZE &&
3230 sattr->ia_size < i_size_read(inode))
3221 pnfs_commit_and_return_layout(inode); 3231 pnfs_commit_and_return_layout(inode);
3222 3232
3223 nfs_fattr_init(fattr); 3233 nfs_fattr_init(fattr);
@@ -3576,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
3576 3586
3577 if (!nfs4_sequence_done(task, &res->seq_res)) 3587 if (!nfs4_sequence_done(task, &res->seq_res))
3578 return 0; 3588 return 0;
3579 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3589 if (nfs4_async_handle_error(task, res->server, NULL,
3590 &data->timeout) == -EAGAIN)
3580 return 0; 3591 return 0;
3581 update_changeattr(dir, &res->cinfo); 3592 update_changeattr(dir, &res->cinfo);
3582 return 1; 3593 return 1;
@@ -3609,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3609 3620
3610 if (!nfs4_sequence_done(task, &res->seq_res)) 3621 if (!nfs4_sequence_done(task, &res->seq_res))
3611 return 0; 3622 return 0;
3612 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3623 if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
3613 return 0; 3624 return 0;
3614 3625
3615 update_changeattr(old_dir, &res->old_cinfo); 3626 update_changeattr(old_dir, &res->old_cinfo);
@@ -4113,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
4113 4124
4114 trace_nfs4_read(hdr, task->tk_status); 4125 trace_nfs4_read(hdr, task->tk_status);
4115 if (nfs4_async_handle_error(task, server, 4126 if (nfs4_async_handle_error(task, server,
4116 hdr->args.context->state) == -EAGAIN) { 4127 hdr->args.context->state,
4128 NULL) == -EAGAIN) {
4117 rpc_restart_call_prepare(task); 4129 rpc_restart_call_prepare(task);
4118 return -EAGAIN; 4130 return -EAGAIN;
4119 } 4131 }
@@ -4181,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
4181 struct nfs_pgio_header *hdr) 4193 struct nfs_pgio_header *hdr)
4182{ 4194{
4183 struct inode *inode = hdr->inode; 4195 struct inode *inode = hdr->inode;
4184 4196
4185 trace_nfs4_write(hdr, task->tk_status); 4197 trace_nfs4_write(hdr, task->tk_status);
4186 if (nfs4_async_handle_error(task, NFS_SERVER(inode), 4198 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4187 hdr->args.context->state) == -EAGAIN) { 4199 hdr->args.context->state,
4200 NULL) == -EAGAIN) {
4188 rpc_restart_call_prepare(task); 4201 rpc_restart_call_prepare(task);
4189 return -EAGAIN; 4202 return -EAGAIN;
4190 } 4203 }
@@ -4264,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
4264 struct inode *inode = data->inode; 4277 struct inode *inode = data->inode;
4265 4278
4266 trace_nfs4_commit(data, task->tk_status); 4279 trace_nfs4_commit(data, task->tk_status);
4267 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 4280 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4281 NULL, NULL) == -EAGAIN) {
4268 rpc_restart_call_prepare(task); 4282 rpc_restart_call_prepare(task);
4269 return -EAGAIN; 4283 return -EAGAIN;
4270 } 4284 }
@@ -4817,7 +4831,8 @@ out:
4817 4831
4818 4832
4819static int 4833static int
4820nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 4834nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4835 struct nfs4_state *state, long *timeout)
4821{ 4836{
4822 struct nfs_client *clp = server->nfs_client; 4837 struct nfs_client *clp = server->nfs_client;
4823 4838
@@ -4867,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4867#endif /* CONFIG_NFS_V4_1 */ 4882#endif /* CONFIG_NFS_V4_1 */
4868 case -NFS4ERR_DELAY: 4883 case -NFS4ERR_DELAY:
4869 nfs_inc_server_stats(server, NFSIOS_DELAY); 4884 nfs_inc_server_stats(server, NFSIOS_DELAY);
4885 rpc_delay(task, nfs4_update_delay(timeout));
4886 goto restart_call;
4870 case -NFS4ERR_GRACE: 4887 case -NFS4ERR_GRACE:
4871 rpc_delay(task, NFS4_POLL_RETRY_MAX); 4888 rpc_delay(task, NFS4_POLL_RETRY_MAX);
4872 case -NFS4ERR_RETRY_UNCACHED_REP: 4889 case -NFS4ERR_RETRY_UNCACHED_REP:
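With the new fourth argument, -NFS4ERR_DELAY in the async path backs off exponentially per operation instead of always sleeping NFS4_POLL_RETRY_MAX: callers that care (unlink and rename above) embed a long in their calldata and pass its address on every restart, while callers passing NULL keep the old fixed maximum delay. A hypothetical completion callback wired up the same way; foo_data and foo_done are illustrative names, not kernel code:

/* Hypothetical calldata for an async NFSv4 operation; only the
 * fields the error path needs are shown. The timeout starts at 0,
 * so the first NFS4ERR_DELAY waits NFS4_POLL_RETRY_MIN. */
struct foo_data {
	struct nfs_server *server;
	long timeout;
};

static void foo_done(struct rpc_task *task, void *calldata)
{
	struct foo_data *data = calldata;

	if (nfs4_async_handle_error(task, data->server, NULL,
				    &data->timeout) == -EAGAIN) {
		rpc_restart_call_prepare(task);
		return;
	}
	/* ...normal completion handling... */
}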
@@ -5107,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5107 pnfs_roc_set_barrier(data->inode, data->roc_barrier); 5124 pnfs_roc_set_barrier(data->inode, data->roc_barrier);
5108 break; 5125 break;
5109 default: 5126 default:
5110 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5127 if (nfs4_async_handle_error(task, data->res.server,
5111 -EAGAIN) { 5128 NULL, NULL) == -EAGAIN) {
5112 rpc_restart_call_prepare(task); 5129 rpc_restart_call_prepare(task);
5113 return; 5130 return;
5114 } 5131 }
@@ -5372,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5372 case -NFS4ERR_EXPIRED: 5389 case -NFS4ERR_EXPIRED:
5373 break; 5390 break;
5374 default: 5391 default:
5375 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 5392 if (nfs4_async_handle_error(task, calldata->server,
5393 NULL, NULL) == -EAGAIN)
5376 rpc_restart_call_prepare(task); 5394 rpc_restart_call_prepare(task);
5377 } 5395 }
5378 nfs_release_seqid(calldata->arg.seqid); 5396 nfs_release_seqid(calldata->arg.seqid);
@@ -5978,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5978 break; 5996 break;
5979 case -NFS4ERR_LEASE_MOVED: 5997 case -NFS4ERR_LEASE_MOVED:
5980 case -NFS4ERR_DELAY: 5998 case -NFS4ERR_DELAY:
5981 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) 5999 if (nfs4_async_handle_error(task, server,
6000 NULL, NULL) == -EAGAIN)
5982 rpc_restart_call_prepare(task); 6001 rpc_restart_call_prepare(task);
5983 } 6002 }
5984} 6003}
@@ -7583,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7583 } else { 7602 } else {
7584 LIST_HEAD(head); 7603 LIST_HEAD(head);
7585 7604
7605 /*
7606 * Mark the bad layout state as invalid, then retry
7607 * with the current stateid.
7608 */
7586 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); 7609 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
7587 spin_unlock(&inode->i_lock); 7610 spin_unlock(&inode->i_lock);
7588 /* Mark the bad layout state as invalid, then
7589 * retry using the open stateid. */
7590 pnfs_free_lseg_list(&head); 7611 pnfs_free_lseg_list(&head);
7612
7613 task->tk_status = 0;
7614 rpc_restart_call_prepare(task);
7591 } 7615 }
7592 } 7616 }
7593 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) 7617 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
7594 rpc_restart_call_prepare(task); 7618 rpc_restart_call_prepare(task);
7595out: 7619out:
7596 dprintk("<-- %s\n", __func__); 7620 dprintk("<-- %s\n", __func__);
@@ -7750,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
7750 case 0: 7774 case 0:
7751 break; 7775 break;
7752 case -NFS4ERR_DELAY: 7776 case -NFS4ERR_DELAY:
7753 if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) 7777 if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
7754 break; 7778 break;
7755 rpc_restart_call_prepare(task); 7779 rpc_restart_call_prepare(task);
7756 return; 7780 return;
@@ -7809,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7809 return status; 7833 return status;
7810} 7834}
7811 7835
7812/*
7813 * Retrieve the list of Data Server devices from the MDS.
7814 */
7815static int _nfs4_getdevicelist(struct nfs_server *server,
7816 const struct nfs_fh *fh,
7817 struct pnfs_devicelist *devlist)
7818{
7819 struct nfs4_getdevicelist_args args = {
7820 .fh = fh,
7821 .layoutclass = server->pnfs_curr_ld->id,
7822 };
7823 struct nfs4_getdevicelist_res res = {
7824 .devlist = devlist,
7825 };
7826 struct rpc_message msg = {
7827 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
7828 .rpc_argp = &args,
7829 .rpc_resp = &res,
7830 };
7831 int status;
7832
7833 dprintk("--> %s\n", __func__);
7834 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
7835 &res.seq_res, 0);
7836 dprintk("<-- %s status=%d\n", __func__, status);
7837 return status;
7838}
7839
7840int nfs4_proc_getdevicelist(struct nfs_server *server,
7841 const struct nfs_fh *fh,
7842 struct pnfs_devicelist *devlist)
7843{
7844 struct nfs4_exception exception = { };
7845 int err;
7846
7847 do {
7848 err = nfs4_handle_exception(server,
7849 _nfs4_getdevicelist(server, fh, devlist),
7850 &exception);
7851 } while (exception.retry);
7852
7853 dprintk("%s: err=%d, num_devs=%u\n", __func__,
7854 err, devlist->num_devs);
7855
7856 return err;
7857}
7858EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
7859
7860static int 7836static int
7861_nfs4_proc_getdeviceinfo(struct nfs_server *server, 7837_nfs4_proc_getdeviceinfo(struct nfs_server *server,
7862 struct pnfs_device *pdev, 7838 struct pnfs_device *pdev,
@@ -7929,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
7929 case 0: 7905 case 0:
7930 break; 7906 break;
7931 default: 7907 default:
7932 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7908 if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
7933 rpc_restart_call_prepare(task); 7909 rpc_restart_call_prepare(task);
7934 return; 7910 return;
7935 } 7911 }
@@ -8225,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
8225 8201
8226 switch (task->tk_status) { 8202 switch (task->tk_status) {
8227 case -NFS4ERR_DELAY: 8203 case -NFS4ERR_DELAY:
8228 if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) 8204 if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
8229 rpc_restart_call_prepare(task); 8205 rpc_restart_call_prepare(task);
8230 } 8206 }
8231} 8207}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 87b2d0e79797..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2345,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2345 status = nfs4_check_lease(clp); 2345 status = nfs4_check_lease(clp);
2346 if (status < 0) 2346 if (status < 0)
2347 goto out_error; 2347 goto out_error;
2348 continue;
2348 } 2349 }
2349 2350
2350 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { 2351 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
365#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ 365#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
366 encode_verifier_maxsz) 366 XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
367#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ 367 1 /* layout type */ + \
368 2 /* nfs_cookie4 gdlr_cookie */ + \ 368 1 /* maxcount */ + \
369 decode_verifier_maxsz \ 369 1 /* bitmap size */ + \
370 /* verifier4 gdlr_verifier */ + \ 370 1 /* notification bitmap length */ + \
371 1 /* gdlr_deviceid_list count */ + \ 371 1 /* notification bitmap, word 0 */)
372 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
373 NFS4_DEVICEID4_SIZE) \
374 /* gdlr_deviceid_list */ + \
375 1 /* bool gdlr_eof */)
376#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
377 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
378#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 372#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
379 1 /* layout type */ + \ 373 1 /* layout type */ + \
380 1 /* opaque devaddr4 length */ + \ 374 1 /* opaque devaddr4 length */ + \
381 /* devaddr4 payload is read into page */ \ 375 /* devaddr4 payload is read into page */ \
382 1 /* notification bitmap length */ + \ 376 1 /* notification bitmap length */ + \
383 1 /* notification bitmap */) 377 1 /* notification bitmap, word 0 */)
384#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ 378#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
385 encode_stateid_maxsz) 379 encode_stateid_maxsz)
386#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 380#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
395 2 /* last byte written */ + \ 389 2 /* last byte written */ + \
396 1 /* nt_timechanged (false) */ + \ 390 1 /* nt_timechanged (false) */ + \
397 1 /* layoutupdate4 layout type */ + \ 391 1 /* layoutupdate4 layout type */ + \
398 1 /* NULL filelayout layoutupdate4 payload */) 392 1 /* layoutupdate4 opaque len */)
393 /* the actual content of layoutupdate4 should
394 be allocated by drivers and spliced in
395 using xdr_write_pages */
399#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 396#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
400#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ 397#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
401 encode_stateid_maxsz + \ 398 encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
809#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 806#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
810 decode_sequence_maxsz + \ 807 decode_sequence_maxsz + \
811 decode_reclaim_complete_maxsz) 808 decode_reclaim_complete_maxsz)
812#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
813 encode_sequence_maxsz + \
814 encode_putfh_maxsz + \
815 encode_getdevicelist_maxsz)
816#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
817 decode_sequence_maxsz + \
818 decode_putfh_maxsz + \
819 decode_getdevicelist_maxsz)
820#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 809#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz +\ 810 encode_sequence_maxsz +\
822 encode_getdeviceinfo_maxsz) 811 encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
1927 1916
1928#ifdef CONFIG_NFS_V4_1 1917#ifdef CONFIG_NFS_V4_1
1929static void 1918static void
1930encode_getdevicelist(struct xdr_stream *xdr,
1931 const struct nfs4_getdevicelist_args *args,
1932 struct compound_hdr *hdr)
1933{
1934 __be32 *p;
1935 nfs4_verifier dummy = {
1936 .data = "dummmmmy",
1937 };
1938
1939 encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
1940 p = reserve_space(xdr, 16);
1941 *p++ = cpu_to_be32(args->layoutclass);
1942 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1943 xdr_encode_hyper(p, 0ULL); /* cookie */
1944 encode_nfs4_verifier(xdr, &dummy);
1945}
1946
1947static void
1948encode_getdeviceinfo(struct xdr_stream *xdr, 1919encode_getdeviceinfo(struct xdr_stream *xdr,
1949 const struct nfs4_getdeviceinfo_args *args, 1920 const struct nfs4_getdeviceinfo_args *args,
1950 struct compound_hdr *hdr) 1921 struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1952 __be32 *p; 1923 __be32 *p;
1953 1924
1954 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); 1925 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
1955 p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); 1926 p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
1956 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1927 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1957 NFS4_DEVICEID4_SIZE); 1928 NFS4_DEVICEID4_SIZE);
1958 *p++ = cpu_to_be32(args->pdev->layout_type); 1929 *p++ = cpu_to_be32(args->pdev->layout_type);
1959 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ 1930 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
1960 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1931
1932 p = reserve_space(xdr, 4 + 4);
1933 *p++ = cpu_to_be32(1); /* bitmap length */
1934 *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
1961} 1935}
1962 1936
1963static void 1937static void
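The encoder now asks the server for device-ID change and delete notifications in a one-word bitmap where it used to send an empty one; the decode side further down accepts exactly those two bits in word 0 and logs anything unexpected. A self-contained check of the bit arithmetic, with the constants mirroring the enum values in include/linux/nfs4.h (1 << 1 and 1 << 2, per the RFC 5661 notify_deviceid_type4 bit numbers):

#include <assert.h>
#include <stdint.h>

#define NOTIFY_DEVICEID4_CHANGE (1 << 1)
#define NOTIFY_DEVICEID4_DELETE (1 << 2)

int main(void)
{
	/* Word 0 of the bitmap as encoded above. */
	uint32_t word0 = NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE;

	/* Any bit outside the two we requested would be flagged
	 * by the decoder. */
	assert((word0 & ~(NOTIFY_DEVICEID4_CHANGE |
			  NOTIFY_DEVICEID4_DELETE)) == 0);
	return 0;
}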
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
1990static int 1964static int
1991encode_layoutcommit(struct xdr_stream *xdr, 1965encode_layoutcommit(struct xdr_stream *xdr,
1992 struct inode *inode, 1966 struct inode *inode,
1993 const struct nfs4_layoutcommit_args *args, 1967 struct nfs4_layoutcommit_args *args,
1994 struct compound_hdr *hdr) 1968 struct compound_hdr *hdr)
1995{ 1969{
1996 __be32 *p; 1970 __be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
2011 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1985 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
2012 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1986 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
2013 1987
2014 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) 1988 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
2015 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( 1989 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
2016 NFS_I(inode)->layout, xdr, args); 1990 NFS_I(inode)->layout, xdr, args);
2017 else 1991 } else {
2018 encode_uint32(xdr, 0); /* no layout-type payload */ 1992 encode_uint32(xdr, args->layoutupdate_len);
1993 if (args->layoutupdate_pages) {
1994 xdr_write_pages(xdr, args->layoutupdate_pages, 0,
1995 args->layoutupdate_len);
1996 }
1997 }
2019 1998
2020 return 0; 1999 return 0;
2021} 2000}
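Drivers without an ->encode_layoutcommit hook can now hand the generic encoder a pre-marshalled layoutupdate4 body: the opaque length comes from args->layoutupdate_len and the payload is spliced in from args->layoutupdate_pages via xdr_write_pages(). A sketch of the ->prepare_layoutcommit hook (added to struct pnfs_layoutdriver_type later in this diff) that would fill those fields; foo_marshal_update() and the single-page assumption are illustrative only:

/* Hypothetical driver hook: marshal the layoutupdate4 body into
 * driver-owned pages ahead of the LAYOUTCOMMIT encode. */
static int foo_prepare_layoutcommit(struct nfs4_layoutcommit_args *args)
{
	struct page **pages;

	pages = kcalloc(1, sizeof(*pages), GFP_NOFS);
	if (!pages)
		return -ENOMEM;
	pages[0] = alloc_page(GFP_NOFS);
	if (!pages[0]) {
		kfree(pages);
		return -ENOMEM;
	}
	/* foo_marshal_update() stands in for the driver's XDR writer
	 * and returns the number of bytes it produced. */
	args->layoutupdate_len = foo_marshal_update(page_address(pages[0]));
	args->layoutupdate_pages = pages;
	return 0;
}

The matching ->cleanup_layoutcommit hook would free the page array once the RPC completes.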
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2893} 2872}
2894 2873
2895/* 2874/*
2896 * Encode GETDEVICELIST request
2897 */
2898static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2899 struct xdr_stream *xdr,
2900 struct nfs4_getdevicelist_args *args)
2901{
2902 struct compound_hdr hdr = {
2903 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2904 };
2905
2906 encode_compound_hdr(xdr, req, &hdr);
2907 encode_sequence(xdr, &args->seq_args, &hdr);
2908 encode_putfh(xdr, args->fh, &hdr);
2909 encode_getdevicelist(xdr, args, &hdr);
2910 encode_nops(&hdr);
2911}
2912
2913/*
2914 * Encode GETDEVICEINFO request 2875 * Encode GETDEVICEINFO request
2915 */ 2876 */
2916static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2877static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
5765} 5726}
5766 5727
5767#if defined(CONFIG_NFS_V4_1) 5728#if defined(CONFIG_NFS_V4_1)
5768/*
5769 * TODO: Need to handle case when EOF != true;
5770 */
5771static int decode_getdevicelist(struct xdr_stream *xdr,
5772 struct pnfs_devicelist *res)
5773{
5774 __be32 *p;
5775 int status, i;
5776 nfs4_verifier verftemp;
5777
5778 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5779 if (status)
5780 return status;
5781
5782 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5783 if (unlikely(!p))
5784 goto out_overflow;
5785
5786 /* TODO: Skip cookie for now */
5787 p += 2;
5788
5789 /* Read verifier */
5790 p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
5791
5792 res->num_devs = be32_to_cpup(p);
5793
5794 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5795
5796 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5797 printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
5798 __func__, res->num_devs);
5799 return -EIO;
5800 }
5801
5802 p = xdr_inline_decode(xdr,
5803 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5804 if (unlikely(!p))
5805 goto out_overflow;
5806 for (i = 0; i < res->num_devs; i++)
5807 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5808 NFS4_DEVICEID4_SIZE);
5809 res->eof = be32_to_cpup(p);
5810 return 0;
5811out_overflow:
5812 print_overflow_msg(__func__, xdr);
5813 return -EIO;
5814}
5815
5816static int decode_getdeviceinfo(struct xdr_stream *xdr, 5729static int decode_getdeviceinfo(struct xdr_stream *xdr,
5817 struct pnfs_device *pdev) 5730 struct pnfs_device *pdev)
5818{ 5731{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
5862 p = xdr_inline_decode(xdr, 4 * len); 5775 p = xdr_inline_decode(xdr, 4 * len);
5863 if (unlikely(!p)) 5776 if (unlikely(!p))
5864 goto out_overflow; 5777 goto out_overflow;
5865 for (i = 0; i < len; i++, p++) { 5778
5866 if (be32_to_cpup(p)) { 5779 if (be32_to_cpup(p++) &
5867 dprintk("%s: notifications not supported\n", 5780 ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
5781 dprintk("%s: unsupported notification\n",
5782 __func__);
5783 }
5784
5785 for (i = 1; i < len; i++) {
5786 if (be32_to_cpup(p++)) {
5787 dprintk("%s: unsupported notification\n",
5868 __func__); 5788 __func__);
5869 return -EIO; 5789 return -EIO;
5870 } 5790 }
@@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
7097} 7017}
7098 7018
7099/* 7019/*
7100 * Decode GETDEVICELIST response
7101 */
7102static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
7103 struct xdr_stream *xdr,
7104 struct nfs4_getdevicelist_res *res)
7105{
7106 struct compound_hdr hdr;
7107 int status;
7108
7109 dprintk("encoding getdevicelist!\n");
7110
7111 status = decode_compound_hdr(xdr, &hdr);
7112 if (status != 0)
7113 goto out;
7114 status = decode_sequence(xdr, &res->seq_res, rqstp);
7115 if (status != 0)
7116 goto out;
7117 status = decode_putfh(xdr);
7118 if (status != 0)
7119 goto out;
7120 status = decode_getdevicelist(xdr, res->devlist);
7121out:
7122 return status;
7123}
7124
7125/*
7126 * Decode GETDEVINFO response 7020 * Decode GETDEVINFO response
7127 */ 7021 */
7128static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 7022static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = {
7490 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), 7384 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7491 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), 7385 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7492 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), 7386 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7493 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
7494 PROC(BIND_CONN_TO_SESSION, 7387 PROC(BIND_CONN_TO_SESSION,
7495 enc_bind_conn_to_session, dec_bind_conn_to_session), 7388 enc_bind_conn_to_session, dec_bind_conn_to_session),
7496 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), 7389 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60 kfree(de); 60 kfree(de);
61} 61}
62 62
63static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
64 const struct nfs4_deviceid *d_id)
65{
66 struct nfs4_deviceid_node *d;
67 struct objio_dev_ent *de;
68
69 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
70 if (!d)
71 return NULL;
72
73 de = container_of(d, struct objio_dev_ent, id_node);
74 return de;
75}
76
77static struct objio_dev_ent *
78_dev_list_add(const struct nfs_server *nfss,
79 const struct nfs4_deviceid *d_id, struct osd_dev *od,
80 gfp_t gfp_flags)
81{
82 struct nfs4_deviceid_node *d;
83 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
84 struct objio_dev_ent *n;
85
86 if (!de) {
87 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
88 return NULL;
89 }
90
91 dprintk("%s: Adding od=%p\n", __func__, od);
92 nfs4_init_deviceid_node(&de->id_node,
93 nfss->pnfs_curr_ld,
94 nfss->nfs_client,
95 d_id);
96 de->od.od = od;
97
98 d = nfs4_insert_deviceid_node(&de->id_node);
99 n = container_of(d, struct objio_dev_ent, id_node);
100 if (n != de) {
101 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
102 objio_free_deviceid_node(&de->id_node);
103 de = n;
104 }
105
106 return de;
107}
108
109struct objio_segment { 63struct objio_segment {
110 struct pnfs_layout_segment lseg; 64 struct pnfs_layout_segment lseg;
111 65
@@ -130,29 +84,24 @@ struct objio_state {
130 84
131/* Send and wait for a get_device_info of devices in the layout, 85/* Send and wait for a get_device_info of devices in the layout,
132 then look them up with the osd_initiator library */ 86 then look them up with the osd_initiator library */
133static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 87struct nfs4_deviceid_node *
134 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, 88objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
135 gfp_t gfp_flags) 89 gfp_t gfp_flags)
136{ 90{
137 struct pnfs_osd_deviceaddr *deviceaddr; 91 struct pnfs_osd_deviceaddr *deviceaddr;
138 struct objio_dev_ent *ode; 92 struct objio_dev_ent *ode = NULL;
139 struct osd_dev *od; 93 struct osd_dev *od;
140 struct osd_dev_info odi; 94 struct osd_dev_info odi;
141 bool retry_flag = true; 95 bool retry_flag = true;
96 __be32 *p;
142 int err; 97 int err;
143 98
144 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 99 deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
145 if (ode) { 100 if (!deviceaddr)
146 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ 101 return NULL;
147 return 0;
148 }
149 102
150 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 103 p = page_address(pdev->pages[0]);
151 if (unlikely(err)) { 104 pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
152 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
153 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
154 return err;
155 }
156 105
157 odi.systemid_len = deviceaddr->oda_systemid.len; 106 odi.systemid_len = deviceaddr->oda_systemid.len;
158 if (odi.systemid_len > sizeof(odi.systemid)) { 107 if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
188 goto out; 137 goto out;
189 } 138 }
190 139
191 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
192 gfp_flags);
193 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
194 dprintk("Adding new dev_id(%llx:%llx)\n", 140 dprintk("Adding new dev_id(%llx:%llx)\n",
195 _DEVID_LO(d_id), _DEVID_HI(d_id)); 141 _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
142
143 ode = kzalloc(sizeof(*ode), gfp_flags);
144 if (!ode) {
145 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
146 goto out;
147 }
148
149 nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
150 kfree(deviceaddr);
151
152 ode->od.od = od;
153 return &ode->id_node;
154
196out: 155out:
197 objlayout_put_deviceinfo(deviceaddr); 156 kfree(deviceaddr);
198 return err; 157 return NULL;
199} 158}
200 159
201static void copy_single_comp(struct ore_components *oc, unsigned c, 160static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
254 struct xdr_stream *xdr, 213 struct xdr_stream *xdr,
255 gfp_t gfp_flags) 214 gfp_t gfp_flags)
256{ 215{
216 struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
257 struct objio_segment *objio_seg; 217 struct objio_segment *objio_seg;
258 struct pnfs_osd_xdr_decode_layout_iter iter; 218 struct pnfs_osd_xdr_decode_layout_iter iter;
259 struct pnfs_osd_layout layout; 219 struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
283 objio_seg->oc.first_dev = layout.olo_comps_index; 243 objio_seg->oc.first_dev = layout.olo_comps_index;
284 cur_comp = 0; 244 cur_comp = 0;
285 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { 245 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
246 struct nfs4_deviceid_node *d;
247 struct objio_dev_ent *ode;
248
286 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); 249 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
287 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, 250
288 &src_comp.oc_object_id.oid_device_id, 251 d = nfs4_find_get_deviceid(server,
289 gfp_flags); 252 &src_comp.oc_object_id.oid_device_id,
290 if (err) 253 pnfslay->plh_lc_cred, gfp_flags);
254 if (!d) {
255 err = -ENXIO;
291 goto err; 256 goto err;
292 ++cur_comp; 257 }
258
259 ode = container_of(d, struct objio_dev_ent, id_node);
260 objio_seg->oc.ods[cur_comp++] = &ode->od;
293 } 261 }
294 /* pnfs_osd_xdr_decode_layout_comp returns false on error */ 262 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
295 if (unlikely(err)) 263 if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
653 .flags = PNFS_LAYOUTRET_ON_SETATTR | 621 .flags = PNFS_LAYOUTRET_ON_SETATTR |
654 PNFS_LAYOUTRET_ON_ERROR, 622 PNFS_LAYOUTRET_ON_ERROR,
655 623
624 .max_deviceinfo_size = PAGE_SIZE,
656 .owner = THIS_MODULE, 625 .owner = THIS_MODULE,
657 .alloc_layout_hdr = objlayout_alloc_layout_hdr, 626 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
658 .free_layout_hdr = objlayout_free_layout_hdr, 627 .free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
574 dprintk("%s: Return\n", __func__); 574 dprintk("%s: Return\n", __func__);
575} 575}
576 576
577
578/*
579 * Get Device Info API for io engines
580 */
581struct objlayout_deviceinfo {
582 struct page *page;
583 struct pnfs_osd_deviceaddr da; /* This must be last */
584};
585
586/* Initialize and call nfs_getdeviceinfo, then decode and return a
587 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
588 * should be called.
589 */
590int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
591 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
592 gfp_t gfp_flags)
593{
594 struct objlayout_deviceinfo *odi;
595 struct pnfs_device pd;
596 struct page *page, **pages;
597 u32 *p;
598 int err;
599
600 page = alloc_page(gfp_flags);
601 if (!page)
602 return -ENOMEM;
603
604 pages = &page;
605 pd.pages = pages;
606
607 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
608 pd.layout_type = LAYOUT_OSD2_OBJECTS;
609 pd.pages = &page;
610 pd.pgbase = 0;
611 pd.pglen = PAGE_SIZE;
612 pd.mincount = 0;
613 pd.maxcount = PAGE_SIZE;
614
615 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
616 pnfslay->plh_lc_cred);
617 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
618 if (err)
619 goto err_out;
620
621 p = page_address(page);
622 odi = kzalloc(sizeof(*odi), gfp_flags);
623 if (!odi) {
624 err = -ENOMEM;
625 goto err_out;
626 }
627 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
628 odi->page = page;
629 *deviceaddr = &odi->da;
630 return 0;
631
632err_out:
633 __free_page(page);
634 return err;
635}
636
637void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
638{
639 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
640 struct objlayout_deviceinfo,
641 da);
642
643 __free_page(odi->page);
644 kfree(odi);
645}
646
647enum { 577enum {
648 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, 578 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
649 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, 579 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
149extern void objlayout_write_done(struct objlayout_io_res *oir, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/* 152/*
158 * exported generic objects function vectors 153 * exported generic objects function vectors
159 */ 154 */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index be7cbce6e4c7..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
481 return 0; 481 return 0;
482 } 482 }
483 483
484 /*
485 * Limit the request size so that we can still allocate a page array
486 * for it without upsetting the slab allocator.
487 */
488 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
489 sizeof(struct page *) > PAGE_SIZE)
490 return 0;
491
484 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 492 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
485} 493}
486EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 494EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
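The new test bounds a coalesced request by what a single page's worth of page pointers can describe, so the later allocation of the request's pagevec stays friendly to the slab allocator. The resulting ceiling, computed standalone for a typical 64-bit configuration with 4 KiB pages:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;	/* assumed PAGE_SIZE */
	unsigned long ptr_size = 8;	/* assumed sizeof(struct page *) */
	unsigned long max_pages = page_size / ptr_size;

	/* 512 pages -> a 2 MiB cap on any single coalesced request. */
	printf("max %lu pages = %lu KiB per request\n",
	       max_pages, max_pages * page_size / 1024);
	return 0;
}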
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
594 dprintk("%s freeing layout for inode %lu\n", __func__, 594 dprintk("%s freeing layout for inode %lu\n", __func__,
595 lo->plh_inode->i_ino); 595 lo->plh_inode->i_ino);
596 inode = lo->plh_inode; 596 inode = lo->plh_inode;
597
598 pnfs_layoutcommit_inode(inode, false);
599
597 spin_lock(&inode->i_lock); 600 spin_lock(&inode->i_lock);
598 list_del_init(&lo->plh_bulk_destroy); 601 list_del_init(&lo->plh_bulk_destroy);
599 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 602 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
682 return (s32)(s1 - s2) > 0; 685 return (s32)(s1 - s2) > 0;
683} 686}
684 687
685static void
686pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
687 const nfs4_stateid *new,
688 struct list_head *free_me_list)
689{
690 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
691 return;
692 /* Layout is new! Kill existing layout segments */
693 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
694}
695
696/* update lo->plh_stateid with new if is more recent */ 688/* update lo->plh_stateid with new if is more recent */
697void 689void
698pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 690pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
749 status = -EAGAIN; 741 status = -EAGAIN;
750 } else if (!nfs4_valid_open_stateid(open_state)) { 742 } else if (!nfs4_valid_open_stateid(open_state)) {
751 status = -EBADF; 743 status = -EBADF;
752 } else if (list_empty(&lo->plh_segs)) { 744 } else if (list_empty(&lo->plh_segs) ||
745 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
753 int seq; 746 int seq;
754 747
755 do { 748 do {
@@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
864 empty = list_empty(&lo->plh_segs); 857 empty = list_empty(&lo->plh_segs);
865 pnfs_clear_layoutcommit(ino, &tmp_list); 858 pnfs_clear_layoutcommit(ino, &tmp_list);
866 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 859 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
860
861 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
862 struct pnfs_layout_range range = {
863 .iomode = IOMODE_ANY,
864 .offset = 0,
865 .length = NFS4_MAX_UINT64,
866 };
867 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
868 }
869
867 /* Don't send a LAYOUTRETURN if list was initially empty */ 870 /* Don't send a LAYOUTRETURN if list was initially empty */
868 if (empty) { 871 if (empty) {
869 spin_unlock(&ino->i_lock); 872 spin_unlock(&ino->i_lock);
@@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino)
871 dprintk("NFS: %s no layout segments to return\n", __func__); 874 dprintk("NFS: %s no layout segments to return\n", __func__);
872 goto out; 875 goto out;
873 } 876 }
877
878 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
874 lo->plh_block_lgets++; 879 lo->plh_block_lgets++;
875 spin_unlock(&ino->i_lock); 880 spin_unlock(&ino->i_lock);
876 pnfs_free_lseg_list(&tmp_list); 881 pnfs_free_lseg_list(&tmp_list);
@@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1358 goto out; 1363 goto out;
1359 } 1364 }
1360 1365
1366 init_lseg(lo, lseg);
1367 lseg->pls_range = res->range;
1368
1361 spin_lock(&ino->i_lock); 1369 spin_lock(&ino->i_lock);
1362 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1370 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1363 dprintk("%s forget reply due to recall\n", __func__); 1371 dprintk("%s forget reply due to recall\n", __func__);
1364 goto out_forget_reply; 1372 goto out_forget_reply;
1365 } 1373 }
1366 1374
1367 if (pnfs_layoutgets_blocked(lo, 1) || 1375 if (pnfs_layoutgets_blocked(lo, 1)) {
1368 pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1369 dprintk("%s forget reply due to state\n", __func__); 1376 dprintk("%s forget reply due to state\n", __func__);
1370 goto out_forget_reply; 1377 goto out_forget_reply;
1371 } 1378 }
1372 1379
1373 /* Check that the new stateid matches the old stateid */ 1380 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1374 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); 1381 /* existing state ID, make sure the sequence number matches. */
1375 /* Done processing layoutget. Set the layout stateid */ 1382 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1376 pnfs_set_layout_stateid(lo, &res->stateid, false); 1383 dprintk("%s forget reply due to sequence\n", __func__);
1384 goto out_forget_reply;
1385 }
1386 pnfs_set_layout_stateid(lo, &res->stateid, false);
1387 } else {
1388 /*
1389 * We got an entirely new state ID. Mark all segments for the
1390 * inode invalid, and don't bother validating the stateid
1391 * sequence number.
1392 */
1393 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1394
1395 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1396 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1397 }
1398
1399 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1377 1400
1378 init_lseg(lo, lseg);
1379 lseg->pls_range = res->range;
1380 pnfs_get_lseg(lseg); 1401 pnfs_get_lseg(lseg);
1381 pnfs_layout_insert_lseg(lo, lseg); 1402 pnfs_layout_insert_lseg(lo, lseg);
1382 1403
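The rewritten reply handling in pnfs_layout_process() splits on the stateid's "other" field: a reply carrying the stateid we already hold must advance the sequence number past the barrier or be forgotten, while an entirely new stateid is adopted wholesale after every cached segment is invalidated. A standalone model of that decision, using the same serial-number comparison as pnfs_seqid_is_newer() and simplified structures in place of nfs4_stateid:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct model_stateid { uint32_t seqid; uint8_t other[12]; };

static bool seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

enum lgp_action { LGP_FORGET, LGP_UPDATE_SEQID, LGP_REPLACE_INVALIDATE };

static enum lgp_action classify_reply(const struct model_stateid *cached,
				      uint32_t barrier,
				      const struct model_stateid *reply)
{
	if (!memcmp(cached->other, reply->other, sizeof(cached->other))) {
		/* Same stateid: usable only past the barrier. */
		if (!seqid_is_newer(reply->seqid, barrier))
			return LGP_FORGET;
		return LGP_UPDATE_SEQID;
	}
	/* New stateid: drop cached lsegs, adopt stateid and barrier. */
	return LGP_REPLACE_INVALIDATE;
}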
@@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
1797} 1818}
1798EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1819EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1799 1820
1821void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
1822{
1823 struct inode *inode = data->inode;
1824 struct nfs_inode *nfsi = NFS_I(inode);
1825 bool mark_as_dirty = false;
1826
1827 spin_lock(&inode->i_lock);
1828 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1829 mark_as_dirty = true;
1830 dprintk("%s: Set layoutcommit for inode %lu ",
1831 __func__, inode->i_ino);
1832 }
1833 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
1834 /* references matched in nfs4_layoutcommit_release */
1835 pnfs_get_lseg(data->lseg);
1836 }
1837 if (data->lwb > nfsi->layout->plh_lwb)
1838 nfsi->layout->plh_lwb = data->lwb;
1839 spin_unlock(&inode->i_lock);
1840 dprintk("%s: lseg %p end_pos %llu\n",
1841 __func__, data->lseg, nfsi->layout->plh_lwb);
1842
1843 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1844 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
1845 if (mark_as_dirty)
1846 mark_inode_dirty_sync(inode);
1847}
1848EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
1849
1800void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) 1850void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1801{ 1851{
1802 struct nfs_server *nfss = NFS_SERVER(data->args.inode); 1852 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1817int 1867int
1818pnfs_layoutcommit_inode(struct inode *inode, bool sync) 1868pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1819{ 1869{
1870 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1820 struct nfs4_layoutcommit_data *data; 1871 struct nfs4_layoutcommit_data *data;
1821 struct nfs_inode *nfsi = NFS_I(inode); 1872 struct nfs_inode *nfsi = NFS_I(inode);
1822 loff_t end_pos; 1873 loff_t end_pos;
@@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1867 data->args.lastbytewritten = end_pos - 1; 1918 data->args.lastbytewritten = end_pos - 1;
1868 data->res.server = NFS_SERVER(inode); 1919 data->res.server = NFS_SERVER(inode);
1869 1920
1921 if (ld->prepare_layoutcommit) {
1922 status = ld->prepare_layoutcommit(&data->args);
1923 if (status) {
1924 spin_lock(&inode->i_lock);
1925 if (end_pos < nfsi->layout->plh_lwb)
1926 nfsi->layout->plh_lwb = end_pos;
1927 spin_unlock(&inode->i_lock);
1928 put_rpccred(data->cred);
1929 set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
1930 goto clear_layoutcommitting;
1931 }
1932 }
1933
1934
1870 status = nfs4_proc_layoutcommit(data, sync); 1935 status = nfs4_proc_layoutcommit(data, sync);
1871out: 1936out:
1872 if (status) 1937 if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,12 +65,15 @@ enum {
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
68}; 69};
69 70
70enum layoutdriver_policy_flags { 71enum layoutdriver_policy_flags {
71 /* Should the pNFS client commit and return the layout upon a setattr */ 72 /* Should the pNFS client commit and return the layout upon truncate to
73 * a smaller size */
72 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, 74 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
73 PNFS_LAYOUTRET_ON_ERROR = 1 << 1, 75 PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
76 PNFS_READ_WHOLE_PAGE = 1 << 2,
74}; 77};
75 78
76struct nfs4_deviceid_node; 79struct nfs4_deviceid_node;
@@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type {
82 const char *name; 85 const char *name;
83 struct module *owner; 86 struct module *owner;
84 unsigned flags; 87 unsigned flags;
88 unsigned max_deviceinfo_size;
85 89
86 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); 90 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
87 int (*clear_layoutdriver) (struct nfs_server *); 91 int (*clear_layoutdriver) (struct nfs_server *);
@@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type {
92 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 96 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
93 void (*free_lseg) (struct pnfs_layout_segment *lseg); 97 void (*free_lseg) (struct pnfs_layout_segment *lseg);
94 98
99 void (*return_range) (struct pnfs_layout_hdr *lo,
100 struct pnfs_layout_range *range);
101
95 /* test for nfs page cache coalescing */ 102 /* test for nfs page cache coalescing */
96 const struct nfs_pageio_ops *pg_read_ops; 103 const struct nfs_pageio_ops *pg_read_ops;
97 const struct nfs_pageio_ops *pg_write_ops; 104 const struct nfs_pageio_ops *pg_write_ops;
@@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type {
121 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); 128 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
122 129
123 void (*free_deviceid_node) (struct nfs4_deviceid_node *); 130 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
131 struct nfs4_deviceid_node * (*alloc_deviceid_node)
132 (struct nfs_server *server, struct pnfs_device *pdev,
133 gfp_t gfp_flags);
124 134
125 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, 135 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
126 struct xdr_stream *xdr, 136 struct xdr_stream *xdr,
127 const struct nfs4_layoutreturn_args *args); 137 const struct nfs4_layoutreturn_args *args);
128 138
129 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); 139 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
130 140 int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
131 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 141 void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
132 struct xdr_stream *xdr, 142 struct xdr_stream *xdr,
133 const struct nfs4_layoutcommit_args *args); 143 const struct nfs4_layoutcommit_args *args);
134}; 144};
@@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
171extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 181extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
172 182
173/* nfs4proc.c */ 183/* nfs4proc.c */
174extern int nfs4_proc_getdevicelist(struct nfs_server *server,
175 const struct nfs_fh *fh,
176 struct pnfs_devicelist *devlist);
177extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 184extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
178 struct pnfs_device *dev, 185 struct pnfs_device *dev,
179 struct rpc_cred *cred); 186 struct rpc_cred *cred);
@@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino);
219void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 226void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
220bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 227bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
221void pnfs_set_layoutcommit(struct nfs_pgio_header *); 228void pnfs_set_layoutcommit(struct nfs_pgio_header *);
229void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
222void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 230void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
223int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 231int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
224int _pnfs_return_layout(struct inode *); 232int _pnfs_return_layout(struct inode *);
@@ -255,11 +263,12 @@ struct nfs4_deviceid_node {
255 atomic_t ref; 263 atomic_t ref;
256}; 264};
257 265
258struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 266struct nfs4_deviceid_node *
267nfs4_find_get_deviceid(struct nfs_server *server,
268 const struct nfs4_deviceid *id, struct rpc_cred *cred,
269 gfp_t gfp_mask);
259void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 270void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
260void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 271void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
261 const struct pnfs_layoutdriver_type *,
262 const struct nfs_client *,
263 const struct nfs4_deviceid *); 272 const struct nfs4_deviceid *);
264struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); 273struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
265bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); 274bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
@@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
267bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 276bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
268void nfs4_deviceid_purge_client(const struct nfs_client *); 277void nfs4_deviceid_purge_client(const struct nfs_client *);
269 278
279static inline struct nfs4_deviceid_node *
280nfs4_get_deviceid(struct nfs4_deviceid_node *d)
281{
282 atomic_inc(&d->ref);
283 return d;
284}
285
270static inline struct pnfs_layout_segment * 286static inline struct pnfs_layout_segment *
271pnfs_get_lseg(struct pnfs_layout_segment *lseg) 287pnfs_get_lseg(struct pnfs_layout_segment *lseg)
272{ 288{
@@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
368} 384}
369 385
370static inline bool 386static inline bool
387pnfs_ld_read_whole_page(struct inode *inode)
388{
389 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
390 return false;
391 return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
392}
393
394static inline bool
371pnfs_layoutcommit_outstanding(struct inode *inode) 395pnfs_layoutcommit_outstanding(struct inode *inode)
372{ 396{
373 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
@@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
443} 467}
444 468
445static inline bool 469static inline bool
470pnfs_ld_read_whole_page(struct inode *inode)
471{
472 return false;
473}
474
475static inline bool
446pnfs_roc(struct inode *ino) 476pnfs_roc(struct inode *ino)
447{ 477{
448 return false; 478 return false;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
29 */ 29 */
30 30
31#include <linux/export.h> 31#include <linux/export.h>
32#include <linux/nfs_fs.h>
33#include "nfs4session.h"
34#include "internal.h"
32#include "pnfs.h" 35#include "pnfs.h"
33 36
34#define NFSDBG_FACILITY NFSDBG_PNFS 37#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
89 return NULL; 92 return NULL;
90} 93}
91 94
95static struct nfs4_deviceid_node *
96nfs4_get_device_info(struct nfs_server *server,
97 const struct nfs4_deviceid *dev_id,
98 struct rpc_cred *cred, gfp_t gfp_flags)
99{
100 struct nfs4_deviceid_node *d = NULL;
101 struct pnfs_device *pdev = NULL;
102 struct page **pages = NULL;
103 u32 max_resp_sz;
104 int max_pages;
105 int rc, i;
106
107 /*
108 * Use the session max response size as the basis for setting
109 * GETDEVICEINFO's maxcount
110 */
111 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
112 if (server->pnfs_curr_ld->max_deviceinfo_size &&
113 server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
114 max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
115 max_pages = nfs_page_array_len(0, max_resp_sz);
116 dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
117 __func__, server, max_resp_sz, max_pages);
118
119 pdev = kzalloc(sizeof(*pdev), gfp_flags);
120 if (!pdev)
121 return NULL;
122
123 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
124 if (!pages)
125 goto out_free_pdev;
126
127 for (i = 0; i < max_pages; i++) {
128 pages[i] = alloc_page(gfp_flags);
129 if (!pages[i])
130 goto out_free_pages;
131 }
132
133 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
134 pdev->layout_type = server->pnfs_curr_ld->id;
135 pdev->pages = pages;
136 pdev->pgbase = 0;
137 pdev->pglen = max_resp_sz;
138 pdev->mincount = 0;
139 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
140
141 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
142 dprintk("%s getdevice info returns %d\n", __func__, rc);
143 if (rc)
144 goto out_free_pages;
145
146 /*
147 * Found new device, need to decode it and then add it to the
148 * list of known devices for this mountpoint.
149 */
150 d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
151 gfp_flags);
152
153out_free_pages:
154 for (i = 0; i < max_pages; i++)
155 __free_page(pages[i]);
156 kfree(pages);
157out_free_pdev:
158 kfree(pdev);
159 dprintk("<-- %s d %p\n", __func__, d);
160 return d;
161}
162
92/* 163/*
93 * Lookup a deviceid in cache and get a reference count on it if found 164 * Lookup a deviceid in cache and get a reference count on it if found
94 * 165 *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
96 * @id deviceid to look up 167 * @id deviceid to look up
97 */ 168 */
98static struct nfs4_deviceid_node * 169static struct nfs4_deviceid_node *
99_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 170__nfs4_find_get_deviceid(struct nfs_server *server,
100 const struct nfs_client *clp, const struct nfs4_deviceid *id, 171 const struct nfs4_deviceid *id, long hash)
101 long hash)
102{ 172{
103 struct nfs4_deviceid_node *d; 173 struct nfs4_deviceid_node *d;
104 174
105 rcu_read_lock(); 175 rcu_read_lock();
106 d = _lookup_deviceid(ld, clp, id, hash); 176 d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
177 hash);
107 if (d != NULL) 178 if (d != NULL)
108 atomic_inc(&d->ref); 179 atomic_inc(&d->ref);
109 rcu_read_unlock(); 180 rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
111} 182}
112 183
113struct nfs4_deviceid_node * 184struct nfs4_deviceid_node *
114nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 185nfs4_find_get_deviceid(struct nfs_server *server,
115 const struct nfs_client *clp, const struct nfs4_deviceid *id) 186 const struct nfs4_deviceid *id, struct rpc_cred *cred,
187 gfp_t gfp_mask)
116{ 188{
117 return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); 189 long hash = nfs4_deviceid_hash(id);
190 struct nfs4_deviceid_node *d, *new;
191
192 d = __nfs4_find_get_deviceid(server, id, hash);
193 if (d)
194 return d;
195
196 new = nfs4_get_device_info(server, id, cred, gfp_mask);
197 if (!new)
198 return new;
199
200 spin_lock(&nfs4_deviceid_lock);
201 d = __nfs4_find_get_deviceid(server, id, hash);
202 if (d) {
203 spin_unlock(&nfs4_deviceid_lock);
204 server->pnfs_curr_ld->free_deviceid_node(new);
205 return d;
206 }
207 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
208 atomic_inc(&new->ref);
209 spin_unlock(&nfs4_deviceid_lock);
210
211 return new;
118} 212}
119EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 213EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
120 214
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
151EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 245EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
152 246
153void 247void
154nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, 248nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
155 const struct pnfs_layoutdriver_type *ld,
156 const struct nfs_client *nfs_client,
157 const struct nfs4_deviceid *id) 249 const struct nfs4_deviceid *id)
158{ 250{
159 INIT_HLIST_NODE(&d->node); 251 INIT_HLIST_NODE(&d->node);
160 INIT_HLIST_NODE(&d->tmpnode); 252 INIT_HLIST_NODE(&d->tmpnode);
161 d->ld = ld; 253 d->ld = server->pnfs_curr_ld;
162 d->nfs_client = nfs_client; 254 d->nfs_client = server->nfs_client;
163 d->flags = 0; 255 d->flags = 0;
164 d->deviceid = *id; 256 d->deviceid = *id;
165 atomic_set(&d->ref, 1); 257 atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
167EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); 259EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
168 260
169/* 261/*
170 * Uniquely initialize and insert a deviceid node into cache
171 *
172 * @new new deviceid node
173 * Note that the caller must set up the following members:
174 * new->ld
175 * new->nfs_client
176 * new->deviceid
177 *
178 * @ret the inserted node, if none found, otherwise, the found entry.
179 */
180struct nfs4_deviceid_node *
181nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
182{
183 struct nfs4_deviceid_node *d;
184 long hash;
185
186 spin_lock(&nfs4_deviceid_lock);
187 hash = nfs4_deviceid_hash(&new->deviceid);
188 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
189 if (d) {
190 spin_unlock(&nfs4_deviceid_lock);
191 return d;
192 }
193
194 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
195 spin_unlock(&nfs4_deviceid_lock);
196 atomic_inc(&new->ref);
197
198 return new;
199}
200EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
201
202/*
203 * Dereference a deviceid node and delete it when its reference count drops 262 * Dereference a deviceid node and delete it when its reference count drops
204 * to zero. 263 * to zero.
205 * 264 *
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
299 } 358 }
300 rcu_read_unlock(); 359 rcu_read_unlock();
301} 360}
302
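The rewritten nfs4_find_get_deviceid() above folds the old lookup/insert
pair into one lookup-or-create path: an RCU fast path, an unlocked
GETDEVICEINFO upcall via nfs4_get_device_info(), then a recheck under
nfs4_deviceid_lock so that a racing inserter wins and the loser frees its
duplicate. A stripped-down sketch of that double-checked pattern (all
names hypothetical, mirroring the code above):

	struct node *cache_find_or_create(struct cache_key *key)
	{
		struct node *n, *new;

		n = cache_lookup_get_rcu(key);	/* fast path, takes a ref */
		if (n)
			return n;
		new = node_alloc(key);		/* slow path, may sleep / do RPC */
		if (!new)
			return NULL;
		spin_lock(&cache_lock);
		n = cache_lookup_get_rcu(key);	/* recheck under the lock */
		if (n) {
			spin_unlock(&cache_lock);
			node_free(new);		/* lost the race */
			return n;
		}
		hlist_add_head_rcu(&new->link, cache_bucket(key));
		atomic_inc(&new->ref);		/* one ref for the cache ... */
		spin_unlock(&cache_lock);
		return new;			/* ... plus the init ref for the caller */
	}

Note that the RPC is deliberately issued outside the spinlock; the cost of
losing the race is an occasional redundant GETDEVICEINFO, not sleeping
allocation under a spinlock.
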
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
2065 return NFS_TEXT_DATA; 2065 return NFS_TEXT_DATA;
2066 } 2066 }
2067 2067
2068#if !IS_ENABLED(CONFIG_NFS_V3)
2069 if (args->version == 3)
2070 goto out_v3_not_compiled;
2071#endif /* !CONFIG_NFS_V3 */
2072
2073 return 0; 2068 return 0;
2074 2069
2075out_no_data: 2070out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
2085 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); 2080 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
2086 return -EINVAL; 2081 return -EINVAL;
2087 2082
2088#if !IS_ENABLED(CONFIG_NFS_V3)
2089out_v3_not_compiled:
2090 dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
2091 return -EPROTONOSUPPORT;
2092#endif /* !CONFIG_NFS_V3 */
2093
2094out_nomem: 2083out_nomem:
2095 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 2084 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
2096 return -ENOMEM; 2085 return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 175d5d073ccf..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
49static void nfs_clear_request_commit(struct nfs_page *req); 49static void nfs_clear_request_commit(struct nfs_page *req);
50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, 50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
51 struct inode *inode); 51 struct inode *inode);
52static struct nfs_page *
53nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
54 struct page *page);
52 55
53static struct kmem_cache *nfs_wdata_cachep; 56static struct kmem_cache *nfs_wdata_cachep;
54static mempool_t *nfs_wdata_mempool; 57static mempool_t *nfs_wdata_mempool;
@@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
95} 98}
96 99
97/* 100/*
98 * nfs_page_search_commits_for_head_request_locked
99 *
100 * Search through commit lists on @inode for the head request for @page.
101 * Must be called while holding the inode (which is cinfo) lock.
102 *
103 * Returns the head request if found, or NULL if not found.
104 */
105static struct nfs_page *
106nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
107 struct page *page)
108{
109 struct nfs_page *freq, *t;
110 struct nfs_commit_info cinfo;
111 struct inode *inode = &nfsi->vfs_inode;
112
113 nfs_init_cinfo_from_inode(&cinfo, inode);
114
115 /* search through pnfs commit lists */
116 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
117 if (freq)
118 return freq->wb_head;
119
120 /* Linearly search the commit list for the correct request */
121 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
122 if (freq->wb_page == page)
123 return freq->wb_head;
124 }
125
126 return NULL;
127}
128
129/*
130 * nfs_page_find_head_request_locked - find head request associated with @page 101 * nfs_page_find_head_request_locked - find head request associated with @page
131 * 102 *
132 * must be called while holding the inode lock. 103 * must be called while holding the inode lock.
@@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
271 242
272static int wb_priority(struct writeback_control *wbc) 243static int wb_priority(struct writeback_control *wbc)
273{ 244{
245 int ret = 0;
274 if (wbc->for_reclaim) 246 if (wbc->for_reclaim)
275 return FLUSH_HIGHPRI | FLUSH_STABLE; 247 return FLUSH_HIGHPRI | FLUSH_STABLE;
248 if (wbc->sync_mode == WB_SYNC_ALL)
249 ret = FLUSH_COND_STABLE;
276 if (wbc->for_kupdate || wbc->for_background) 250 if (wbc->for_kupdate || wbc->for_background)
277 return FLUSH_LOWPRI | FLUSH_COND_STABLE; 251 ret |= FLUSH_LOWPRI;
278 return FLUSH_COND_STABLE; 252 return ret;
279} 253}
280 254
281/* 255/*
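The rewritten wb_priority() now treats stability and priority as separate
dimensions rather than mutually exclusive returns. Enumerating the new
outcomes (derived directly from the hunk above):

	/* wb_priority() after this change:
	 *   wbc->for_reclaim                   -> FLUSH_HIGHPRI | FLUSH_STABLE
	 *   WB_SYNC_ALL, foreground            -> FLUSH_COND_STABLE
	 *   WB_SYNC_ALL + kupdate/background   -> FLUSH_COND_STABLE | FLUSH_LOWPRI
	 *   WB_SYNC_NONE + kupdate/background  -> FLUSH_LOWPRI
	 *   WB_SYNC_NONE, foreground           -> 0
	 */

In particular, background writeback no longer requests FLUSH_COND_STABLE
unless the caller asked for WB_SYNC_ALL.
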
@@ -731,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
731 if (likely(!PageSwapCache(head->wb_page))) { 705 if (likely(!PageSwapCache(head->wb_page))) {
732 set_page_private(head->wb_page, 0); 706 set_page_private(head->wb_page, 0);
733 ClearPagePrivate(head->wb_page); 707 ClearPagePrivate(head->wb_page);
708 smp_mb__after_atomic();
709 wake_up_page(head->wb_page, PG_private);
734 clear_bit(PG_MAPPED, &head->wb_flags); 710 clear_bit(PG_MAPPED, &head->wb_flags);
735 } 711 }
736 nfsi->npages--; 712 nfsi->npages--;
@@ -749,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
749 __set_page_dirty_nobuffers(req->wb_page); 725 __set_page_dirty_nobuffers(req->wb_page);
750} 726}
751 727
752#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 728/*
729 * nfs_page_search_commits_for_head_request_locked
730 *
731 * Search through commit lists on @inode for the head request for @page.
732 * Must be called while holding the inode (which is cinfo) lock.
733 *
734 * Returns the head request if found, or NULL if not found.
735 */
736static struct nfs_page *
737nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
738 struct page *page)
739{
740 struct nfs_page *freq, *t;
741 struct nfs_commit_info cinfo;
742 struct inode *inode = &nfsi->vfs_inode;
743
744 nfs_init_cinfo_from_inode(&cinfo, inode);
745
746 /* search through pnfs commit lists */
747 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
748 if (freq)
749 return freq->wb_head;
750
751 /* Linearly search the commit list for the correct request */
752 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
753 if (freq->wb_page == page)
754 return freq->wb_head;
755 }
756
757 return NULL;
758}
759
753/** 760/**
754 * nfs_request_add_commit_list - add request to a commit list 761 * nfs_request_add_commit_list - add request to a commit list
755 * @req: pointer to a struct nfs_page 762 * @req: pointer to a struct nfs_page
@@ -867,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
867 return hdr->verf.committed != NFS_FILE_SYNC; 874 return hdr->verf.committed != NFS_FILE_SYNC;
868} 875}
869 876
870#else
871static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
872 struct inode *inode)
873{
874}
875
876void nfs_init_cinfo(struct nfs_commit_info *cinfo,
877 struct inode *inode,
878 struct nfs_direct_req *dreq)
879{
880}
881
882void
883nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
884 struct nfs_commit_info *cinfo)
885{
886}
887
888static void
889nfs_clear_request_commit(struct nfs_page *req)
890{
891}
892
893int nfs_write_need_commit(struct nfs_pgio_header *hdr)
894{
895 return 0;
896}
897
898#endif
899
900static void nfs_write_completion(struct nfs_pgio_header *hdr) 877static void nfs_write_completion(struct nfs_pgio_header *hdr)
901{ 878{
902 struct nfs_commit_info cinfo; 879 struct nfs_commit_info cinfo;
@@ -932,7 +909,6 @@ out:
932 hdr->release(hdr); 909 hdr->release(hdr);
933} 910}
934 911
935#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
936unsigned long 912unsigned long
937nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 913nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
938{ 914{
@@ -989,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
989 return ret; 965 return ret;
990} 966}
991 967
992#else
993unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
994{
995 return 0;
996}
997
998int nfs_scan_commit(struct inode *inode, struct list_head *dst,
999 struct nfs_commit_info *cinfo)
1000{
1001 return 0;
1002}
1003#endif
1004
1005/* 968/*
1006 * Search for an existing write request, and attempt to update 969 * Search for an existing write request, and attempt to update
1007 * it to reflect a new dirty region on a given page. 970 * it to reflect a new dirty region on a given page.
@@ -1394,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1394 return status; 1357 return status;
1395 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); 1358 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
1396 1359
1397#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1398 if (hdr->res.verf->committed < hdr->args.stable && 1360 if (hdr->res.verf->committed < hdr->args.stable &&
1399 task->tk_status >= 0) { 1361 task->tk_status >= 0) {
1400 /* We tried a write call, but the server did not 1362 /* We tried a write call, but the server did not
@@ -1416,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1416 complain = jiffies + 300 * HZ; 1378 complain = jiffies + 300 * HZ;
1417 } 1379 }
1418 } 1380 }
1419#endif
1420 1381
1421 /* Deal with the suid/sgid bit corner case */ 1382 /* Deal with the suid/sgid bit corner case */
1422 if (nfs_should_remove_suid(inode)) 1383 if (nfs_should_remove_suid(inode))
@@ -1469,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task,
1469} 1430}
1470 1431
1471 1432
1472#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1473static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1433static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1474{ 1434{
1475 int ret; 1435 int ret;
@@ -1538,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1538} 1498}
1539EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1499EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1540 1500
1501static loff_t nfs_get_lwb(struct list_head *head)
1502{
1503 loff_t lwb = 0;
1504 struct nfs_page *req;
1505
1506 list_for_each_entry(req, head, wb_list)
1507 if (lwb < (req_offset(req) + req->wb_bytes))
1508 lwb = req_offset(req) + req->wb_bytes;
1509
1510 return lwb;
1511}
1512
1541/* 1513/*
1542 * Set up the argument/result storage required for the RPC call. 1514 * Set up the argument/result storage required for the RPC call.
1543 */ 1515 */
@@ -1557,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
1557 data->inode = inode; 1529 data->inode = inode;
1558 data->cred = first->wb_context->cred; 1530 data->cred = first->wb_context->cred;
1559 data->lseg = lseg; /* reference transferred */ 1531 data->lseg = lseg; /* reference transferred */
1532 /* only set lwb for pnfs commit */
1533 if (lseg)
1534 data->lwb = nfs_get_lwb(&data->pages);
1560 data->mds_ops = &nfs_commit_ops; 1535 data->mds_ops = &nfs_commit_ops;
1561 data->completion_ops = cinfo->completion_ops; 1536 data->completion_ops = cinfo->completion_ops;
1562 data->dreq = cinfo->dreq; 1537 data->dreq = cinfo->dreq;
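nfs_get_lwb() above computes the "last write byte" as the maximum end
offset across the commit list, and nfs_init_commit() records it only when
a layout segment is attached, i.e. for a pNFS commit. A quick worked
example (hypothetical request list):

	/* Two requests on data->pages:
	 *   req A: offset 0,    wb_bytes 4096 -> end 4096
	 *   req B: offset 8192, wb_bytes  512 -> end 8704
	 * nfs_get_lwb() returns max(4096, 8704) = 8704, which the pNFS
	 * commit path can then report as how far the file has been
	 * written (cf. the lastbytewritten field in
	 * nfs4_layoutcommit_args). */
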
@@ -1636,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1636 struct nfs_page *req; 1611 struct nfs_page *req;
1637 int status = data->task.tk_status; 1612 int status = data->task.tk_status;
1638 struct nfs_commit_info cinfo; 1613 struct nfs_commit_info cinfo;
1614 struct nfs_server *nfss;
1639 1615
1640 while (!list_empty(&data->pages)) { 1616 while (!list_empty(&data->pages)) {
1641 req = nfs_list_entry(data->pages.next); 1617 req = nfs_list_entry(data->pages.next);
@@ -1669,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1669 next: 1645 next:
1670 nfs_unlock_and_release_request(req); 1646 nfs_unlock_and_release_request(req);
1671 } 1647 }
1648 nfss = NFS_SERVER(data->inode);
1649 if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
1650 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
1651
1672 nfs_init_cinfo(&cinfo, data->inode, data->dreq); 1652 nfs_init_cinfo(&cinfo, data->inode, data->dreq);
1673 if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) 1653 if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
1674 nfs_commit_clear_lock(NFS_I(data->inode)); 1654 nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1778,12 +1758,6 @@ out_mark_dirty:
1778 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1758 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1779 return ret; 1759 return ret;
1780} 1760}
1781#else
1782static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1783{
1784 return 0;
1785}
1786#endif
1787 1761
1788int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1762int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1789{ 1763{
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5180a7ededec..28d649054d5f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -443,22 +443,6 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file)
443} 443}
444 444
445/* 445/*
446 * linux/fs/nfs/xattr.c
447 */
448#ifdef CONFIG_NFS_V3_ACL
449extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
450extern ssize_t nfs3_getxattr(struct dentry *, const char *, void *, size_t);
451extern int nfs3_setxattr(struct dentry *, const char *,
452 const void *, size_t, int);
453extern int nfs3_removexattr (struct dentry *, const char *name);
454#else
455# define nfs3_listxattr NULL
456# define nfs3_getxattr NULL
457# define nfs3_setxattr NULL
458# define nfs3_removexattr NULL
459#endif
460
461/*
462 * linux/fs/nfs/direct.c 446 * linux/fs/nfs/direct.c
463 */ 447 */
464extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t); 448extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
@@ -529,17 +513,9 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
529extern int nfs_wb_all(struct inode *inode); 513extern int nfs_wb_all(struct inode *inode);
530extern int nfs_wb_page(struct inode *inode, struct page* page); 514extern int nfs_wb_page(struct inode *inode, struct page* page);
531extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); 515extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
532#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
533extern int nfs_commit_inode(struct inode *, int); 516extern int nfs_commit_inode(struct inode *, int);
534extern struct nfs_commit_data *nfs_commitdata_alloc(void); 517extern struct nfs_commit_data *nfs_commitdata_alloc(void);
535extern void nfs_commit_free(struct nfs_commit_data *data); 518extern void nfs_commit_free(struct nfs_commit_data *data);
536#else
537static inline int
538nfs_commit_inode(struct inode *inode, int how)
539{
540 return 0;
541}
542#endif
543 519
544static inline int 520static inline int
545nfs_have_writebacks(struct inode *inode) 521nfs_have_writebacks(struct inode *inode)
@@ -557,23 +533,6 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *,
557 struct page *); 533 struct page *);
558 534
559/* 535/*
560 * linux/fs/nfs3proc.c
561 */
562#ifdef CONFIG_NFS_V3_ACL
563extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
564extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
565extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
566 struct posix_acl *dfacl);
567extern const struct xattr_handler *nfs3_xattr_handlers[];
568#else
569static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
570 struct posix_acl *dfacl)
571{
572 return 0;
573}
574#endif /* CONFIG_NFS_V3_ACL */
575
576/*
577 * inline functions 536 * inline functions
578 */ 537 */
579 538
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0040629894df..6951c7d9097d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -252,17 +252,6 @@ struct nfs4_layoutget {
252 gfp_t gfp_flags; 252 gfp_t gfp_flags;
253}; 253};
254 254
255struct nfs4_getdevicelist_args {
256 struct nfs4_sequence_args seq_args;
257 const struct nfs_fh *fh;
258 u32 layoutclass;
259};
260
261struct nfs4_getdevicelist_res {
262 struct nfs4_sequence_res seq_res;
263 struct pnfs_devicelist *devlist;
264};
265
266struct nfs4_getdeviceinfo_args { 255struct nfs4_getdeviceinfo_args {
267 struct nfs4_sequence_args seq_args; 256 struct nfs4_sequence_args seq_args;
268 struct pnfs_device *pdev; 257 struct pnfs_device *pdev;
@@ -279,6 +268,9 @@ struct nfs4_layoutcommit_args {
279 __u64 lastbytewritten; 268 __u64 lastbytewritten;
280 struct inode *inode; 269 struct inode *inode;
281 const u32 *bitmask; 270 const u32 *bitmask;
271 size_t layoutupdate_len;
272 struct page *layoutupdate_page;
273 struct page **layoutupdate_pages;
282}; 274};
283 275
284struct nfs4_layoutcommit_res { 276struct nfs4_layoutcommit_res {
@@ -1328,6 +1320,7 @@ struct nfs_commit_data {
1328 struct pnfs_layout_segment *lseg; 1320 struct pnfs_layout_segment *lseg;
1329 struct nfs_client *ds_clp; /* pNFS data server */ 1321 struct nfs_client *ds_clp; /* pNFS data server */
1330 int ds_commit_index; 1322 int ds_commit_index;
1323 loff_t lwb;
1331 const struct rpc_call_ops *mds_ops; 1324 const struct rpc_call_ops *mds_ops;
1332 const struct nfs_commit_completion_ops *completion_ops; 1325 const struct nfs_commit_completion_ops *completion_ops;
1333 int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data); 1326 int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
@@ -1346,6 +1339,7 @@ struct nfs_unlinkdata {
1346 struct inode *dir; 1339 struct inode *dir;
1347 struct rpc_cred *cred; 1340 struct rpc_cred *cred;
1348 struct nfs_fattr dir_attr; 1341 struct nfs_fattr dir_attr;
1342 long timeout;
1349}; 1343};
1350 1344
1351struct nfs_renamedata { 1345struct nfs_renamedata {
@@ -1359,6 +1353,7 @@ struct nfs_renamedata {
1359 struct dentry *new_dentry; 1353 struct dentry *new_dentry;
1360 struct nfs_fattr new_fattr; 1354 struct nfs_fattr new_fattr;
1361 void (*complete)(struct rpc_task *, struct nfs_renamedata *); 1355 void (*complete)(struct rpc_task *, struct nfs_renamedata *);
1356 long timeout;
1362}; 1357};
1363 1358
1364struct nfs_access_entry; 1359struct nfs_access_entry;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3df8c7db7a4e..2dca0cef3506 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -496,12 +496,14 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
496} 496}
497 497
498/* 498/*
499 * This is exported only for wait_on_page_locked/wait_on_page_writeback. 499 * This is exported only for wait_on_page_locked/wait_on_page_writeback,
500 * Never use this directly! 500 * and for filesystems which need to wait on PG_private.
501 */ 501 */
502extern void wait_on_page_bit(struct page *page, int bit_nr); 502extern void wait_on_page_bit(struct page *page, int bit_nr);
503 503
504extern int wait_on_page_bit_killable(struct page *page, int bit_nr); 504extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
505extern int wait_on_page_bit_killable_timeout(struct page *page,
506 int bit_nr, unsigned long timeout);
505 507
506static inline int wait_on_page_locked_killable(struct page *page) 508static inline int wait_on_page_locked_killable(struct page *page)
507{ 509{
@@ -510,6 +512,12 @@ static inline int wait_on_page_locked_killable(struct page *page)
510 return 0; 512 return 0;
511} 513}
512 514
515extern wait_queue_head_t *page_waitqueue(struct page *page);
516static inline void wake_up_page(struct page *page, int bit)
517{
518 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
519}
520
513/* 521/*
514 * Wait for a page to be unlocked. 522 * Wait for a page to be unlocked.
515 * 523 *
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fcbfe8783243..cf391eef2e6d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -357,6 +357,7 @@ int xs_swapper(struct rpc_xprt *xprt, int enable);
357#define XPRT_CONNECTION_ABORT (7) 357#define XPRT_CONNECTION_ABORT (7)
358#define XPRT_CONNECTION_CLOSE (8) 358#define XPRT_CONNECTION_CLOSE (8)
359#define XPRT_CONGESTED (9) 359#define XPRT_CONGESTED (9)
360#define XPRT_CONNECTION_REUSE (10)
360 361
361static inline void xprt_set_connected(struct rpc_xprt *xprt) 362static inline void xprt_set_connected(struct rpc_xprt *xprt)
362{ 363{
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6fb1ba5f9b2f..80115bf88671 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -25,7 +25,7 @@ struct wait_bit_key {
25 void *flags; 25 void *flags;
26 int bit_nr; 26 int bit_nr;
27#define WAIT_ATOMIC_T_BIT_NR -1 27#define WAIT_ATOMIC_T_BIT_NR -1
28 unsigned long private; 28 unsigned long timeout;
29}; 29};
30 30
31struct wait_bit_queue { 31struct wait_bit_queue {
@@ -154,6 +154,7 @@ int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_ac
154void wake_up_bit(void *, int); 154void wake_up_bit(void *, int);
155void wake_up_atomic_t(atomic_t *); 155void wake_up_atomic_t(atomic_t *);
156int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); 156int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
157int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long);
157int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned); 158int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned);
158int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); 159int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
159wait_queue_head_t *bit_waitqueue(void *, int); 160wait_queue_head_t *bit_waitqueue(void *, int);
@@ -859,6 +860,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
859 860
860extern int bit_wait(struct wait_bit_key *); 861extern int bit_wait(struct wait_bit_key *);
861extern int bit_wait_io(struct wait_bit_key *); 862extern int bit_wait_io(struct wait_bit_key *);
863extern int bit_wait_timeout(struct wait_bit_key *);
864extern int bit_wait_io_timeout(struct wait_bit_key *);
862 865
863/** 866/**
864 * wait_on_bit - wait for a bit to be cleared 867 * wait_on_bit - wait for a bit to be cleared
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..5a62915f47a8 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
343} 343}
344EXPORT_SYMBOL(out_of_line_wait_on_bit); 344EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched out_of_line_wait_on_bit_timeout(
347 void *word, int bit, wait_bit_action_f *action,
348 unsigned mode, unsigned long timeout)
349{
350 wait_queue_head_t *wq = bit_waitqueue(word, bit);
351 DEFINE_WAIT_BIT(wait, word, bit);
352
353 wait.key.timeout = jiffies + timeout;
354 return __wait_on_bit(wq, &wait, action, mode);
355}
356EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
357
346int __sched 358int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 359__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 wait_bit_action_f *action, unsigned mode) 360 wait_bit_action_f *action, unsigned mode)
@@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
520 return 0; 532 return 0;
521} 533}
522EXPORT_SYMBOL(bit_wait_io); 534EXPORT_SYMBOL(bit_wait_io);
535
536__sched int bit_wait_timeout(struct wait_bit_key *word)
537{
538 unsigned long now = ACCESS_ONCE(jiffies);
539 if (signal_pending_state(current->state, current))
540 return 1;
541 if (time_after_eq(now, word->timeout))
542 return -EAGAIN;
543 schedule_timeout(word->timeout - now);
544 return 0;
545}
546EXPORT_SYMBOL_GPL(bit_wait_timeout);
547
548__sched int bit_wait_io_timeout(struct wait_bit_key *word)
549{
550 unsigned long now = ACCESS_ONCE(jiffies);
551 if (signal_pending_state(current->state, current))
552 return 1;
553 if (time_after_eq(now, word->timeout))
554 return -EAGAIN;
555 io_schedule_timeout(word->timeout - now);
556 return 0;
557}
558EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
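Both timeout actions above store an absolute deadline in the new
wait_bit_key.timeout field and return -EAGAIN once jiffies passes it; a
pending signal still wins with a nonzero return. A minimal sketch of a
caller, assuming a driver-private flags word (names hypothetical):

	/* Wait up to five seconds for bit 0 of *flags to clear.
	 * Returns 0 when the bit cleared, -EAGAIN on timeout, or a
	 * nonzero value if interrupted by a signal. */
	static int example_wait_flag(unsigned long *flags)
	{
		return out_of_line_wait_on_bit_timeout(flags, 0,
						       bit_wait_timeout,
						       TASK_KILLABLE, 5 * HZ);
	}
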
diff --git a/mm/filemap.c b/mm/filemap.c
index 90effcdf948d..b9b1413080be 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc);
670 * at a cost of "thundering herd" phenomena during rare hash 670 * at a cost of "thundering herd" phenomena during rare hash
671 * collisions. 671 * collisions.
672 */ 672 */
673static wait_queue_head_t *page_waitqueue(struct page *page) 673wait_queue_head_t *page_waitqueue(struct page *page)
674{ 674{
675 const struct zone *zone = page_zone(page); 675 const struct zone *zone = page_zone(page);
676 676
677 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; 677 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
678} 678}
679 679EXPORT_SYMBOL(page_waitqueue);
680static inline void wake_up_page(struct page *page, int bit)
681{
682 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
683}
684 680
685void wait_on_page_bit(struct page *page, int bit_nr) 681void wait_on_page_bit(struct page *page, int bit_nr)
686{ 682{
@@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
703 bit_wait_io, TASK_KILLABLE); 699 bit_wait_io, TASK_KILLABLE);
704} 700}
705 701
702int wait_on_page_bit_killable_timeout(struct page *page,
703 int bit_nr, unsigned long timeout)
704{
705 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
706
707 wait.key.timeout = jiffies + timeout;
708 if (!test_bit(bit_nr, &page->flags))
709 return 0;
710 return __wait_on_bit(page_waitqueue(page), &wait,
711 bit_wait_io_timeout, TASK_KILLABLE);
712}
713EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
714
706/** 715/**
707 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 716 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
708 * @page: Page defining the wait queue of interest 717 * @page: Page defining the wait queue of interest
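wait_on_page_bit_killable_timeout() above is what the relaxed comment in
pagemap.h refers to: filesystems may now wait on page bits such as
PG_private with an upper bound, pairing with the wake_up_page(...,
PG_private) call added in fs/nfs/write.c. A hypothetical caller:

	/* Give a stuck commit at most one second to clear PG_private
	 * before giving up (sketch; the timeout value is illustrative). */
	static int example_wait_on_page_private(struct page *page)
	{
		if (!PagePrivate(page))
			return 0;
		return wait_on_page_bit_killable_timeout(page, PG_private, HZ);
	}
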
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e0b94ce4c4e6..9acd6ce88db7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1916,6 +1916,7 @@ call_transmit_status(struct rpc_task *task)
1916 case -EHOSTDOWN: 1916 case -EHOSTDOWN:
1917 case -EHOSTUNREACH: 1917 case -EHOSTUNREACH:
1918 case -ENETUNREACH: 1918 case -ENETUNREACH:
1919 case -EPERM:
1919 if (RPC_IS_SOFTCONN(task)) { 1920 if (RPC_IS_SOFTCONN(task)) {
1920 xprt_end_transmit(task); 1921 xprt_end_transmit(task);
1921 rpc_exit(task, task->tk_status); 1922 rpc_exit(task, task->tk_status);
@@ -2021,6 +2022,7 @@ call_status(struct rpc_task *task)
2021 case -EHOSTDOWN: 2022 case -EHOSTDOWN:
2022 case -EHOSTUNREACH: 2023 case -EHOSTUNREACH:
2023 case -ENETUNREACH: 2024 case -ENETUNREACH:
2025 case -EPERM:
2024 if (RPC_IS_SOFTCONN(task)) { 2026 if (RPC_IS_SOFTCONN(task)) {
2025 rpc_exit(task, status); 2027 rpc_exit(task, status);
2026 break; 2028 break;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9358c79fd589..fe3441abdbe5 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -821,9 +821,7 @@ void rpc_execute(struct rpc_task *task)
821 821
822static void rpc_async_schedule(struct work_struct *work) 822static void rpc_async_schedule(struct work_struct *work)
823{ 823{
824 current->flags |= PF_FSTRANS;
825 __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); 824 __rpc_execute(container_of(work, struct rpc_task, u.tk_work));
826 current->flags &= ~PF_FSTRANS;
827} 825}
828 826
829/** 827/**
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2faac4940563..6a4615dd0261 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -205,7 +205,6 @@ xprt_rdma_connect_worker(struct work_struct *work)
205 struct rpc_xprt *xprt = &r_xprt->xprt; 205 struct rpc_xprt *xprt = &r_xprt->xprt;
206 int rc = 0; 206 int rc = 0;
207 207
208 current->flags |= PF_FSTRANS;
209 xprt_clear_connected(xprt); 208 xprt_clear_connected(xprt);
210 209
211 dprintk("RPC: %s: %sconnect\n", __func__, 210 dprintk("RPC: %s: %sconnect\n", __func__,
@@ -216,7 +215,6 @@ xprt_rdma_connect_worker(struct work_struct *work)
216 215
217 dprintk("RPC: %s: exit\n", __func__); 216 dprintk("RPC: %s: exit\n", __func__);
218 xprt_clear_connecting(xprt); 217 xprt_clear_connecting(xprt);
219 current->flags &= ~PF_FSTRANS;
220} 218}
221 219
222/* 220/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 43cd89eacfab..3b305ab17afe 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -399,13 +399,13 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
399 return kernel_sendmsg(sock, &msg, NULL, 0, 0); 399 return kernel_sendmsg(sock, &msg, NULL, 0, 0);
400} 400}
401 401
402static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy) 402static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy, int *sent_p)
403{ 403{
404 ssize_t (*do_sendpage)(struct socket *sock, struct page *page, 404 ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
405 int offset, size_t size, int flags); 405 int offset, size_t size, int flags);
406 struct page **ppage; 406 struct page **ppage;
407 unsigned int remainder; 407 unsigned int remainder;
408 int err, sent = 0; 408 int err;
409 409
410 remainder = xdr->page_len - base; 410 remainder = xdr->page_len - base;
411 base += xdr->page_base; 411 base += xdr->page_base;
@@ -424,15 +424,15 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
424 err = do_sendpage(sock, *ppage, base, len, flags); 424 err = do_sendpage(sock, *ppage, base, len, flags);
425 if (remainder == 0 || err != len) 425 if (remainder == 0 || err != len)
426 break; 426 break;
427 sent += err; 427 *sent_p += err;
428 ppage++; 428 ppage++;
429 base = 0; 429 base = 0;
430 } 430 }
431 if (sent == 0) 431 if (err > 0) {
432 return err; 432 *sent_p += err;
433 if (err > 0) 433 err = 0;
434 sent += err; 434 }
435 return sent; 435 return err;
436} 436}
437 437
438/** 438/**
@@ -443,12 +443,14 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
443 * @xdr: buffer containing this request 443 * @xdr: buffer containing this request
444 * @base: starting position in the buffer 444 * @base: starting position in the buffer
445 * @zerocopy: true if it is safe to use sendpage() 445 * @zerocopy: true if it is safe to use sendpage()
446 * @sent_p: return the total number of bytes successfully queued for sending
446 * 447 *
447 */ 448 */
448static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy) 449static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy, int *sent_p)
449{ 450{
450 unsigned int remainder = xdr->len - base; 451 unsigned int remainder = xdr->len - base;
451 int err, sent = 0; 452 int err = 0;
453 int sent = 0;
452 454
453 if (unlikely(!sock)) 455 if (unlikely(!sock))
454 return -ENOTSOCK; 456 return -ENOTSOCK;
@@ -465,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
465 err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0); 467 err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
466 if (remainder == 0 || err != len) 468 if (remainder == 0 || err != len)
467 goto out; 469 goto out;
468 sent += err; 470 *sent_p += err;
469 base = 0; 471 base = 0;
470 } else 472 } else
471 base -= xdr->head[0].iov_len; 473 base -= xdr->head[0].iov_len;
@@ -473,23 +475,23 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
473 if (base < xdr->page_len) { 475 if (base < xdr->page_len) {
474 unsigned int len = xdr->page_len - base; 476 unsigned int len = xdr->page_len - base;
475 remainder -= len; 477 remainder -= len;
476 err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy); 478 err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy, &sent);
477 if (remainder == 0 || err != len) 479 *sent_p += sent;
480 if (remainder == 0 || sent != len)
478 goto out; 481 goto out;
479 sent += err;
480 base = 0; 482 base = 0;
481 } else 483 } else
482 base -= xdr->page_len; 484 base -= xdr->page_len;
483 485
484 if (base >= xdr->tail[0].iov_len) 486 if (base >= xdr->tail[0].iov_len)
485 return sent; 487 return 0;
486 err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0); 488 err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0);
487out: 489out:
488 if (sent == 0) 490 if (err > 0) {
489 return err; 491 *sent_p += err;
490 if (err > 0) 492 err = 0;
491 sent += err; 493 }
492 return sent; 494 return err;
493} 495}
494 496
495static void xs_nospace_callback(struct rpc_task *task) 497static void xs_nospace_callback(struct rpc_task *task)
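After this rework xs_sendpages() reports progress and failure on separate
channels: bytes actually queued come back through *sent_p, while the
return value is 0 or a negative errno. Condensed, the caller-side idiom
the three send paths below now share looks like:

	int sent = 0;
	int status = xs_sendpages(sock, addr, addrlen, xdr,
				  req->rq_bytes_sent, zerocopy, &sent);

	if (sent > 0 || status == 0) {
		/* Partial progress counts even when status < 0:
		 * advance the cursor, then decide whether to retry. */
		req->rq_bytes_sent += sent;
		req->rq_xmit_bytes_sent += sent;
		if (req->rq_bytes_sent >= req->rq_slen) {
			req->rq_bytes_sent = 0;
			return 0;
		}
		status = -EAGAIN;
	}

This is what lets the TCP path keep looping on short sends without
mistaking them for hard errors.
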
@@ -573,19 +575,20 @@ static int xs_local_send_request(struct rpc_task *task)
573 container_of(xprt, struct sock_xprt, xprt); 575 container_of(xprt, struct sock_xprt, xprt);
574 struct xdr_buf *xdr = &req->rq_snd_buf; 576 struct xdr_buf *xdr = &req->rq_snd_buf;
575 int status; 577 int status;
578 int sent = 0;
576 579
577 xs_encode_stream_record_marker(&req->rq_snd_buf); 580 xs_encode_stream_record_marker(&req->rq_snd_buf);
578 581
579 xs_pktdump("packet data:", 582 xs_pktdump("packet data:",
580 req->rq_svec->iov_base, req->rq_svec->iov_len); 583 req->rq_svec->iov_base, req->rq_svec->iov_len);
581 584
582 status = xs_sendpages(transport->sock, NULL, 0, 585 status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
583 xdr, req->rq_bytes_sent, true); 586 true, &sent);
584 dprintk("RPC: %s(%u) = %d\n", 587 dprintk("RPC: %s(%u) = %d\n",
585 __func__, xdr->len - req->rq_bytes_sent, status); 588 __func__, xdr->len - req->rq_bytes_sent, status);
586 if (likely(status >= 0)) { 589 if (likely(sent > 0) || status == 0) {
587 req->rq_bytes_sent += status; 590 req->rq_bytes_sent += sent;
588 req->rq_xmit_bytes_sent += status; 591 req->rq_xmit_bytes_sent += sent;
589 if (likely(req->rq_bytes_sent >= req->rq_slen)) { 592 if (likely(req->rq_bytes_sent >= req->rq_slen)) {
590 req->rq_bytes_sent = 0; 593 req->rq_bytes_sent = 0;
591 return 0; 594 return 0;
@@ -626,6 +629,7 @@ static int xs_udp_send_request(struct rpc_task *task)
626 struct rpc_xprt *xprt = req->rq_xprt; 629 struct rpc_xprt *xprt = req->rq_xprt;
627 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 630 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
628 struct xdr_buf *xdr = &req->rq_snd_buf; 631 struct xdr_buf *xdr = &req->rq_snd_buf;
632 int sent = 0;
629 int status; 633 int status;
630 634
631 xs_pktdump("packet data:", 635 xs_pktdump("packet data:",
@@ -634,22 +638,25 @@ static int xs_udp_send_request(struct rpc_task *task)
634 638
635 if (!xprt_bound(xprt)) 639 if (!xprt_bound(xprt))
636 return -ENOTCONN; 640 return -ENOTCONN;
637 status = xs_sendpages(transport->sock, 641 status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
638 xs_addr(xprt), 642 xdr, req->rq_bytes_sent, true, &sent);
639 xprt->addrlen, xdr,
640 req->rq_bytes_sent, true);
641 643
642 dprintk("RPC: xs_udp_send_request(%u) = %d\n", 644 dprintk("RPC: xs_udp_send_request(%u) = %d\n",
643 xdr->len - req->rq_bytes_sent, status); 645 xdr->len - req->rq_bytes_sent, status);
644 646
645 if (status >= 0) { 647 /* firewall is blocking us, don't return -EAGAIN or we end up looping */
646 req->rq_xmit_bytes_sent += status; 648 if (status == -EPERM)
647 if (status >= req->rq_slen) 649 goto process_status;
650
651 if (sent > 0 || status == 0) {
652 req->rq_xmit_bytes_sent += sent;
653 if (sent >= req->rq_slen)
648 return 0; 654 return 0;
649 /* Still some bytes left; set up for a retry later. */ 655 /* Still some bytes left; set up for a retry later. */
650 status = -EAGAIN; 656 status = -EAGAIN;
651 } 657 }
652 658
659process_status:
653 switch (status) { 660 switch (status) {
654 case -ENOTSOCK: 661 case -ENOTSOCK:
655 status = -ENOTCONN; 662 status = -ENOTCONN;
@@ -665,6 +672,7 @@ static int xs_udp_send_request(struct rpc_task *task)
665 case -ENOBUFS: 672 case -ENOBUFS:
666 case -EPIPE: 673 case -EPIPE:
667 case -ECONNREFUSED: 674 case -ECONNREFUSED:
675 case -EPERM:
668 /* When the server has died, an ICMP port unreachable message 676 /* When the server has died, an ICMP port unreachable message
669 * prompts ECONNREFUSED. */ 677 * prompts ECONNREFUSED. */
670 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); 678 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
@@ -713,6 +721,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
713 struct xdr_buf *xdr = &req->rq_snd_buf; 721 struct xdr_buf *xdr = &req->rq_snd_buf;
714 bool zerocopy = true; 722 bool zerocopy = true;
715 int status; 723 int status;
724 int sent;
716 725
717 xs_encode_stream_record_marker(&req->rq_snd_buf); 726 xs_encode_stream_record_marker(&req->rq_snd_buf);
718 727
@@ -730,26 +739,26 @@ static int xs_tcp_send_request(struct rpc_task *task)
730 * to cope with writespace callbacks arriving _after_ we have 739 * to cope with writespace callbacks arriving _after_ we have
731 * called sendmsg(). */ 740 * called sendmsg(). */
732 while (1) { 741 while (1) {
733 status = xs_sendpages(transport->sock, 742 sent = 0;
734 NULL, 0, xdr, req->rq_bytes_sent, 743 status = xs_sendpages(transport->sock, NULL, 0, xdr,
735 zerocopy); 744 req->rq_bytes_sent, zerocopy, &sent);
736 745
737 dprintk("RPC: xs_tcp_send_request(%u) = %d\n", 746 dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
738 xdr->len - req->rq_bytes_sent, status); 747 xdr->len - req->rq_bytes_sent, status);
739 748
740 if (unlikely(status < 0)) 749 if (unlikely(sent == 0 && status < 0))
741 break; 750 break;
742 751
743 /* If we've sent the entire packet, immediately 752 /* If we've sent the entire packet, immediately
744 * reset the count of bytes sent. */ 753 * reset the count of bytes sent. */
745 req->rq_bytes_sent += status; 754 req->rq_bytes_sent += sent;
746 req->rq_xmit_bytes_sent += status; 755 req->rq_xmit_bytes_sent += sent;
747 if (likely(req->rq_bytes_sent >= req->rq_slen)) { 756 if (likely(req->rq_bytes_sent >= req->rq_slen)) {
748 req->rq_bytes_sent = 0; 757 req->rq_bytes_sent = 0;
749 return 0; 758 return 0;
750 } 759 }
751 760
752 if (status != 0) 761 if (sent != 0)
753 continue; 762 continue;
754 status = -EAGAIN; 763 status = -EAGAIN;
755 break; 764 break;
@@ -845,6 +854,8 @@ static void xs_error_report(struct sock *sk)
845 dprintk("RPC: xs_error_report client %p, error=%d...\n", 854 dprintk("RPC: xs_error_report client %p, error=%d...\n",
846 xprt, -err); 855 xprt, -err);
847 trace_rpc_socket_error(xprt, sk->sk_socket, err); 856 trace_rpc_socket_error(xprt, sk->sk_socket, err);
857 if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state))
858 goto out;
848 xprt_wake_pending_tasks(xprt, err); 859 xprt_wake_pending_tasks(xprt, err);
849 out: 860 out:
850 read_unlock_bh(&sk->sk_callback_lock); 861 read_unlock_bh(&sk->sk_callback_lock);
@@ -1746,13 +1757,29 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
1746 unsigned short port = xs_get_srcport(transport); 1757 unsigned short port = xs_get_srcport(transport);
1747 unsigned short last; 1758 unsigned short last;
1748 1759
1760 /*
1761 * If we are asking for any ephemeral port (i.e. port == 0 &&
1762 * transport->xprt.resvport == 0), don't bind. Let the local
1763 * port selection happen implicitly when the socket is used
1764 * (for example at connect time).
1765 *
1766 * This ensures that we can continue to establish TCP
1767 * connections even when all local ephemeral ports are already
1768 * a part of some TCP connection. This makes no difference
 1769	 * for UDP sockets, but also doesn't harm them.
1770 *
1771 * If we're asking for any reserved port (i.e. port == 0 &&
1772 * transport->xprt.resvport == 1) xs_get_srcport above will
1773 * ensure that port is non-zero and we will bind as needed.
1774 */
1775 if (port == 0)
1776 return 0;
1777
1749 memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); 1778 memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
1750 do { 1779 do {
1751 rpc_set_port((struct sockaddr *)&myaddr, port); 1780 rpc_set_port((struct sockaddr *)&myaddr, port);
1752 err = kernel_bind(sock, (struct sockaddr *)&myaddr, 1781 err = kernel_bind(sock, (struct sockaddr *)&myaddr,
1753 transport->xprt.addrlen); 1782 transport->xprt.addrlen);
1754 if (port == 0)
1755 break;
1756 if (err == 0) { 1783 if (err == 0) {
1757 transport->srcport = port; 1784 transport->srcport = port;
1758 break; 1785 break;
@@ -1927,8 +1954,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
1927 struct socket *sock; 1954 struct socket *sock;
1928 int status = -EIO; 1955 int status = -EIO;
1929 1956
1930 current->flags |= PF_FSTRANS;
1931
1932 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); 1957 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1933 status = __sock_create(xprt->xprt_net, AF_LOCAL, 1958 status = __sock_create(xprt->xprt_net, AF_LOCAL,
1934 SOCK_STREAM, 0, &sock, 1); 1959 SOCK_STREAM, 0, &sock, 1);
@@ -1968,7 +1993,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
1968out: 1993out:
1969 xprt_clear_connecting(xprt); 1994 xprt_clear_connecting(xprt);
1970 xprt_wake_pending_tasks(xprt, status); 1995 xprt_wake_pending_tasks(xprt, status);
1971 current->flags &= ~PF_FSTRANS;
1972 return status; 1996 return status;
1973} 1997}
1974 1998
@@ -2071,8 +2095,6 @@ static void xs_udp_setup_socket(struct work_struct *work)
2071 struct socket *sock = transport->sock; 2095 struct socket *sock = transport->sock;
2072 int status = -EIO; 2096 int status = -EIO;
2073 2097
2074 current->flags |= PF_FSTRANS;
2075
2076 /* Start by resetting any existing state */ 2098 /* Start by resetting any existing state */
2077 xs_reset_transport(transport); 2099 xs_reset_transport(transport);
2078 sock = xs_create_sock(xprt, transport, 2100 sock = xs_create_sock(xprt, transport,
@@ -2092,7 +2114,6 @@ static void xs_udp_setup_socket(struct work_struct *work)
2092out: 2114out:
2093 xprt_clear_connecting(xprt); 2115 xprt_clear_connecting(xprt);
2094 xprt_wake_pending_tasks(xprt, status); 2116 xprt_wake_pending_tasks(xprt, status);
2095 current->flags &= ~PF_FSTRANS;
2096} 2117}
2097 2118
2098/* 2119/*
@@ -2229,8 +2250,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2229 struct rpc_xprt *xprt = &transport->xprt; 2250 struct rpc_xprt *xprt = &transport->xprt;
2230 int status = -EIO; 2251 int status = -EIO;
2231 2252
2232 current->flags |= PF_FSTRANS;
2233
2234 if (!sock) { 2253 if (!sock) {
2235 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); 2254 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
2236 sock = xs_create_sock(xprt, transport, 2255 sock = xs_create_sock(xprt, transport,
@@ -2245,7 +2264,9 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2245 abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, 2264 abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
2246 &xprt->state); 2265 &xprt->state);
2247 /* "close" the socket, preserving the local port */ 2266 /* "close" the socket, preserving the local port */
2267 set_bit(XPRT_CONNECTION_REUSE, &xprt->state);
2248 xs_tcp_reuse_connection(transport); 2268 xs_tcp_reuse_connection(transport);
2269 clear_bit(XPRT_CONNECTION_REUSE, &xprt->state);
2249 2270
2250 if (abort_and_exit) 2271 if (abort_and_exit)
2251 goto out_eagain; 2272 goto out_eagain;
@@ -2276,7 +2297,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2276 case -EINPROGRESS: 2297 case -EINPROGRESS:
2277 case -EALREADY: 2298 case -EALREADY:
2278 xprt_clear_connecting(xprt); 2299 xprt_clear_connecting(xprt);
2279 current->flags &= ~PF_FSTRANS;
2280 return; 2300 return;
2281 case -EINVAL: 2301 case -EINVAL:
2282 /* Happens, for instance, if the user specified a link 2302 /* Happens, for instance, if the user specified a link
@@ -2294,7 +2314,6 @@ out_eagain:
2294out: 2314out:
2295 xprt_clear_connecting(xprt); 2315 xprt_clear_connecting(xprt);
2296 xprt_wake_pending_tasks(xprt, status); 2316 xprt_wake_pending_tasks(xprt, status);
2297 current->flags &= ~PF_FSTRANS;
2298} 2317}
2299 2318
2300/** 2319/**