diff options
-rw-r--r-- | fs/nfs/Kconfig | 8 | ||||
-rw-r--r-- | fs/nfs/Makefile | 1 | ||||
-rw-r--r-- | fs/nfs/blocklayout/Makefile | 5 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.c | 1019 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.h | 207 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayoutdev.c | 410 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayoutdm.c | 111 | ||||
-rw-r--r-- | fs/nfs/blocklayout/extents.c | 935 | ||||
-rw-r--r-- | fs/nfs/client.c | 11 | ||||
-rw-r--r-- | fs/nfs/dir.c | 57 | ||||
-rw-r--r-- | fs/nfs/nfs4_fs.h | 2 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayout.c | 2 | ||||
-rw-r--r-- | fs/nfs/nfs4proc.c | 62 | ||||
-rw-r--r-- | fs/nfs/nfs4xdr.c | 233 | ||||
-rw-r--r-- | fs/nfs/pnfs.c | 86 | ||||
-rw-r--r-- | fs/nfs/pnfs.h | 28 | ||||
-rw-r--r-- | include/linux/nfs.h | 2 | ||||
-rw-r--r-- | include/linux/nfs4.h | 1 | ||||
-rw-r--r-- | include/linux/nfs_fs.h | 3 | ||||
-rw-r--r-- | include/linux/nfs_fs_sb.h | 4 | ||||
-rw-r--r-- | include/linux/nfs_xdr.h | 17 |
21 files changed, 3113 insertions, 91 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 2cde5d954750..be020771c6b4 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig | |||
@@ -79,15 +79,21 @@ config NFS_V4_1 | |||
79 | depends on NFS_FS && NFS_V4 && EXPERIMENTAL | 79 | depends on NFS_FS && NFS_V4 && EXPERIMENTAL |
80 | select SUNRPC_BACKCHANNEL | 80 | select SUNRPC_BACKCHANNEL |
81 | select PNFS_FILE_LAYOUT | 81 | select PNFS_FILE_LAYOUT |
82 | select PNFS_BLOCK | ||
83 | select MD | ||
84 | select BLK_DEV_DM | ||
82 | help | 85 | help |
83 | This option enables support for minor version 1 of the NFSv4 protocol | 86 | This option enables support for minor version 1 of the NFSv4 protocol |
84 | (RFC 5661) in the kernel's NFS client. | 87 | (RFC 5661 and RFC 5663) in the kernel's NFS client. |
85 | 88 | ||
86 | If unsure, say N. | 89 | If unsure, say N. |
87 | 90 | ||
88 | config PNFS_FILE_LAYOUT | 91 | config PNFS_FILE_LAYOUT |
89 | tristate | 92 | tristate |
90 | 93 | ||
94 | config PNFS_BLOCK | ||
95 | tristate | ||
96 | |||
91 | config PNFS_OBJLAYOUT | 97 | config PNFS_OBJLAYOUT |
92 | tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" | 98 | tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" |
93 | depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD | 99 | depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD |
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6a34f7dd0e6f..b58613d0abb3 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile | |||
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o | |||
23 | nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o | 23 | nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o |
24 | 24 | ||
25 | obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ | 25 | obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ |
26 | obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ | ||
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 000000000000..d5815505c020 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | # | ||
2 | # Makefile for the pNFS block layout driver kernel module | ||
3 | # | ||
4 | obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o | ||
5 | blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o | ||
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 000000000000..e56564d2ef95 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c | |||
@@ -0,0 +1,1019 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/blocklayout.c | ||
3 | * | ||
4 | * Module for the NFSv4.1 pNFS block layout driver. | ||
5 | * | ||
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Andy Adamson <andros@citi.umich.edu> | ||
10 | * Fred Isaman <iisaman@umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | |||
33 | #include <linux/module.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/mount.h> | ||
36 | #include <linux/namei.h> | ||
37 | #include <linux/bio.h> /* struct bio */ | ||
38 | #include <linux/buffer_head.h> /* various write calls */ | ||
39 | |||
40 | #include "blocklayout.h" | ||
41 | |||
42 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
43 | |||
44 | MODULE_LICENSE("GPL"); | ||
45 | MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); | ||
46 | MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); | ||
47 | |||
48 | struct dentry *bl_device_pipe; | ||
49 | wait_queue_head_t bl_wq; | ||
50 | |||
/*
 * Debug helper: dump the pointer and a fixed set of page flags of @page
 * through dprintk(), one flag per line, followed by an empty line.
 */
static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);

	/* state flags */
	dprintk(" PagePrivate %d\n", PagePrivate(page));
	dprintk(" PageUptodate %d\n", PageUptodate(page));
	dprintk(" PageError %d\n", PageError(page));
	dprintk(" PageDirty %d\n", PageDirty(page));
	dprintk(" PageReferenced %d\n", PageReferenced(page));

	/* locking / writeback flags */
	dprintk(" PageLocked %d\n", PageLocked(page));
	dprintk(" PageWriteback %d\n", PageWriteback(page));
	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));

	dprintk("\n");
}
64 | |||
65 | /* Given the be associated with isect, determine if page data needs to be | ||
66 | * initialized. | ||
67 | */ | ||
68 | static int is_hole(struct pnfs_block_extent *be, sector_t isect) | ||
69 | { | ||
70 | if (be->be_state == PNFS_BLOCK_NONE_DATA) | ||
71 | return 1; | ||
72 | else if (be->be_state != PNFS_BLOCK_INVALID_DATA) | ||
73 | return 0; | ||
74 | else | ||
75 | return !bl_is_sector_init(be->be_inval, isect); | ||
76 | } | ||
77 | |||
78 | /* Given the be associated with isect, determine if page data can be | ||
79 | * written to disk. | ||
80 | */ | ||
81 | static int is_writable(struct pnfs_block_extent *be, sector_t isect) | ||
82 | { | ||
83 | return (be->be_state == PNFS_BLOCK_READWRITE_DATA || | ||
84 | be->be_state == PNFS_BLOCK_INVALID_DATA); | ||
85 | } | ||
86 | |||
87 | /* The data we are handed might be spread across several bios. We need | ||
88 | * to track when the last one is finished. | ||
89 | */ | ||
90 | struct parallel_io { | ||
91 | struct kref refcnt; | ||
92 | struct rpc_call_ops call_ops; | ||
93 | void (*pnfs_callback) (void *data); | ||
94 | void *data; | ||
95 | }; | ||
96 | |||
97 | static inline struct parallel_io *alloc_parallel(void *data) | ||
98 | { | ||
99 | struct parallel_io *rv; | ||
100 | |||
101 | rv = kmalloc(sizeof(*rv), GFP_NOFS); | ||
102 | if (rv) { | ||
103 | rv->data = data; | ||
104 | kref_init(&rv->refcnt); | ||
105 | } | ||
106 | return rv; | ||
107 | } | ||
108 | |||
109 | static inline void get_parallel(struct parallel_io *p) | ||
110 | { | ||
111 | kref_get(&p->refcnt); | ||
112 | } | ||
113 | |||
114 | static void destroy_parallel(struct kref *kref) | ||
115 | { | ||
116 | struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); | ||
117 | |||
118 | dprintk("%s enter\n", __func__); | ||
119 | p->pnfs_callback(p->data); | ||
120 | kfree(p); | ||
121 | } | ||
122 | |||
123 | static inline void put_parallel(struct parallel_io *p) | ||
124 | { | ||
125 | kref_put(&p->refcnt, destroy_parallel); | ||
126 | } | ||
127 | |||
128 | static struct bio * | ||
129 | bl_submit_bio(int rw, struct bio *bio) | ||
130 | { | ||
131 | if (bio) { | ||
132 | get_parallel(bio->bi_private); | ||
133 | dprintk("%s submitting %s bio %u@%llu\n", __func__, | ||
134 | rw == READ ? "read" : "write", | ||
135 | bio->bi_size, (unsigned long long)bio->bi_sector); | ||
136 | submit_bio(rw, bio); | ||
137 | } | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | static struct bio *bl_alloc_init_bio(int npg, sector_t isect, | ||
142 | struct pnfs_block_extent *be, | ||
143 | void (*end_io)(struct bio *, int err), | ||
144 | struct parallel_io *par) | ||
145 | { | ||
146 | struct bio *bio; | ||
147 | |||
148 | bio = bio_alloc(GFP_NOIO, npg); | ||
149 | if (!bio) | ||
150 | return NULL; | ||
151 | |||
152 | bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; | ||
153 | bio->bi_bdev = be->be_mdev; | ||
154 | bio->bi_end_io = end_io; | ||
155 | bio->bi_private = par; | ||
156 | return bio; | ||
157 | } | ||
158 | |||
159 | static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, | ||
160 | sector_t isect, struct page *page, | ||
161 | struct pnfs_block_extent *be, | ||
162 | void (*end_io)(struct bio *, int err), | ||
163 | struct parallel_io *par) | ||
164 | { | ||
165 | retry: | ||
166 | if (!bio) { | ||
167 | bio = bl_alloc_init_bio(npg, isect, be, end_io, par); | ||
168 | if (!bio) | ||
169 | return ERR_PTR(-ENOMEM); | ||
170 | } | ||
171 | if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { | ||
172 | bio = bl_submit_bio(rw, bio); | ||
173 | goto retry; | ||
174 | } | ||
175 | return bio; | ||
176 | } | ||
177 | |||
178 | static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) | ||
179 | { | ||
180 | if (lseg->pls_range.iomode == IOMODE_RW) { | ||
181 | dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); | ||
182 | set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); | ||
183 | } else { | ||
184 | dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); | ||
185 | set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); | ||
186 | } | ||
187 | } | ||
188 | |||
189 | /* This is basically copied from mpage_end_io_read */ | ||
190 | static void bl_end_io_read(struct bio *bio, int err) | ||
191 | { | ||
192 | struct parallel_io *par = bio->bi_private; | ||
193 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
194 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
195 | struct nfs_read_data *rdata = (struct nfs_read_data *)par->data; | ||
196 | |||
197 | do { | ||
198 | struct page *page = bvec->bv_page; | ||
199 | |||
200 | if (--bvec >= bio->bi_io_vec) | ||
201 | prefetchw(&bvec->bv_page->flags); | ||
202 | if (uptodate) | ||
203 | SetPageUptodate(page); | ||
204 | } while (bvec >= bio->bi_io_vec); | ||
205 | if (!uptodate) { | ||
206 | if (!rdata->pnfs_error) | ||
207 | rdata->pnfs_error = -EIO; | ||
208 | bl_set_lo_fail(rdata->lseg); | ||
209 | } | ||
210 | bio_put(bio); | ||
211 | put_parallel(par); | ||
212 | } | ||
213 | |||
214 | static void bl_read_cleanup(struct work_struct *work) | ||
215 | { | ||
216 | struct rpc_task *task; | ||
217 | struct nfs_read_data *rdata; | ||
218 | dprintk("%s enter\n", __func__); | ||
219 | task = container_of(work, struct rpc_task, u.tk_work); | ||
220 | rdata = container_of(task, struct nfs_read_data, task); | ||
221 | pnfs_ld_read_done(rdata); | ||
222 | } | ||
223 | |||
224 | static void | ||
225 | bl_end_par_io_read(void *data) | ||
226 | { | ||
227 | struct nfs_read_data *rdata = data; | ||
228 | |||
229 | INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); | ||
230 | schedule_work(&rdata->task.u.tk_work); | ||
231 | } | ||
232 | |||
/*
 * Placeholder installed as .rpc_call_done in the copied call_ops so that
 * the regular rpc_call_done callback is not used.  Intentionally empty.
 */
static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
{
}
240 | |||
241 | static enum pnfs_try_status | ||
242 | bl_read_pagelist(struct nfs_read_data *rdata) | ||
243 | { | ||
244 | int i, hole; | ||
245 | struct bio *bio = NULL; | ||
246 | struct pnfs_block_extent *be = NULL, *cow_read = NULL; | ||
247 | sector_t isect, extent_length = 0; | ||
248 | struct parallel_io *par; | ||
249 | loff_t f_offset = rdata->args.offset; | ||
250 | size_t count = rdata->args.count; | ||
251 | struct page **pages = rdata->args.pages; | ||
252 | int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; | ||
253 | |||
254 | dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, | ||
255 | rdata->npages, f_offset, count); | ||
256 | |||
257 | par = alloc_parallel(rdata); | ||
258 | if (!par) | ||
259 | goto use_mds; | ||
260 | par->call_ops = *rdata->mds_ops; | ||
261 | par->call_ops.rpc_call_done = bl_rpc_do_nothing; | ||
262 | par->pnfs_callback = bl_end_par_io_read; | ||
263 | /* At this point, we can no longer jump to use_mds */ | ||
264 | |||
265 | isect = (sector_t) (f_offset >> SECTOR_SHIFT); | ||
266 | /* Code assumes extents are page-aligned */ | ||
267 | for (i = pg_index; i < rdata->npages; i++) { | ||
268 | if (!extent_length) { | ||
269 | /* We've used up the previous extent */ | ||
270 | bl_put_extent(be); | ||
271 | bl_put_extent(cow_read); | ||
272 | bio = bl_submit_bio(READ, bio); | ||
273 | /* Get the next one */ | ||
274 | be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg), | ||
275 | isect, &cow_read); | ||
276 | if (!be) { | ||
277 | rdata->pnfs_error = -EIO; | ||
278 | goto out; | ||
279 | } | ||
280 | extent_length = be->be_length - | ||
281 | (isect - be->be_f_offset); | ||
282 | if (cow_read) { | ||
283 | sector_t cow_length = cow_read->be_length - | ||
284 | (isect - cow_read->be_f_offset); | ||
285 | extent_length = min(extent_length, cow_length); | ||
286 | } | ||
287 | } | ||
288 | hole = is_hole(be, isect); | ||
289 | if (hole && !cow_read) { | ||
290 | bio = bl_submit_bio(READ, bio); | ||
291 | /* Fill hole w/ zeroes w/o accessing device */ | ||
292 | dprintk("%s Zeroing page for hole\n", __func__); | ||
293 | zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); | ||
294 | print_page(pages[i]); | ||
295 | SetPageUptodate(pages[i]); | ||
296 | } else { | ||
297 | struct pnfs_block_extent *be_read; | ||
298 | |||
299 | be_read = (hole && cow_read) ? cow_read : be; | ||
300 | bio = bl_add_page_to_bio(bio, rdata->npages - i, READ, | ||
301 | isect, pages[i], be_read, | ||
302 | bl_end_io_read, par); | ||
303 | if (IS_ERR(bio)) { | ||
304 | rdata->pnfs_error = PTR_ERR(bio); | ||
305 | goto out; | ||
306 | } | ||
307 | } | ||
308 | isect += PAGE_CACHE_SECTORS; | ||
309 | extent_length -= PAGE_CACHE_SECTORS; | ||
310 | } | ||
311 | if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) { | ||
312 | rdata->res.eof = 1; | ||
313 | rdata->res.count = rdata->inode->i_size - f_offset; | ||
314 | } else { | ||
315 | rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; | ||
316 | } | ||
317 | out: | ||
318 | bl_put_extent(be); | ||
319 | bl_put_extent(cow_read); | ||
320 | bl_submit_bio(READ, bio); | ||
321 | put_parallel(par); | ||
322 | return PNFS_ATTEMPTED; | ||
323 | |||
324 | use_mds: | ||
325 | dprintk("Giving up and using normal NFS\n"); | ||
326 | return PNFS_NOT_ATTEMPTED; | ||
327 | } | ||
328 | |||
329 | static void mark_extents_written(struct pnfs_block_layout *bl, | ||
330 | __u64 offset, __u32 count) | ||
331 | { | ||
332 | sector_t isect, end; | ||
333 | struct pnfs_block_extent *be; | ||
334 | |||
335 | dprintk("%s(%llu, %u)\n", __func__, offset, count); | ||
336 | if (count == 0) | ||
337 | return; | ||
338 | isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; | ||
339 | end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); | ||
340 | end >>= SECTOR_SHIFT; | ||
341 | while (isect < end) { | ||
342 | sector_t len; | ||
343 | be = bl_find_get_extent(bl, isect, NULL); | ||
344 | BUG_ON(!be); /* FIXME */ | ||
345 | len = min(end, be->be_f_offset + be->be_length) - isect; | ||
346 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
347 | bl_mark_for_commit(be, isect, len); /* What if fails? */ | ||
348 | isect += len; | ||
349 | bl_put_extent(be); | ||
350 | } | ||
351 | } | ||
352 | |||
353 | static void bl_end_io_write_zero(struct bio *bio, int err) | ||
354 | { | ||
355 | struct parallel_io *par = bio->bi_private; | ||
356 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
357 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
358 | struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; | ||
359 | |||
360 | do { | ||
361 | struct page *page = bvec->bv_page; | ||
362 | |||
363 | if (--bvec >= bio->bi_io_vec) | ||
364 | prefetchw(&bvec->bv_page->flags); | ||
365 | /* This is the zeroing page we added */ | ||
366 | end_page_writeback(page); | ||
367 | page_cache_release(page); | ||
368 | } while (bvec >= bio->bi_io_vec); | ||
369 | if (!uptodate) { | ||
370 | if (!wdata->pnfs_error) | ||
371 | wdata->pnfs_error = -EIO; | ||
372 | bl_set_lo_fail(wdata->lseg); | ||
373 | } | ||
374 | bio_put(bio); | ||
375 | put_parallel(par); | ||
376 | } | ||
377 | |||
378 | /* This is basically copied from mpage_end_io_read */ | ||
379 | static void bl_end_io_write(struct bio *bio, int err) | ||
380 | { | ||
381 | struct parallel_io *par = bio->bi_private; | ||
382 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
383 | struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; | ||
384 | |||
385 | if (!uptodate) { | ||
386 | if (!wdata->pnfs_error) | ||
387 | wdata->pnfs_error = -EIO; | ||
388 | bl_set_lo_fail(wdata->lseg); | ||
389 | } | ||
390 | bio_put(bio); | ||
391 | put_parallel(par); | ||
392 | } | ||
393 | |||
394 | /* Function scheduled for call during bl_end_par_io_write, | ||
395 | * it marks sectors as written and extends the commitlist. | ||
396 | */ | ||
397 | static void bl_write_cleanup(struct work_struct *work) | ||
398 | { | ||
399 | struct rpc_task *task; | ||
400 | struct nfs_write_data *wdata; | ||
401 | dprintk("%s enter\n", __func__); | ||
402 | task = container_of(work, struct rpc_task, u.tk_work); | ||
403 | wdata = container_of(task, struct nfs_write_data, task); | ||
404 | if (!wdata->pnfs_error) { | ||
405 | /* Marks for LAYOUTCOMMIT */ | ||
406 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | ||
407 | wdata->args.offset, wdata->args.count); | ||
408 | } | ||
409 | pnfs_ld_write_done(wdata); | ||
410 | } | ||
411 | |||
412 | /* Called when last of bios associated with a bl_write_pagelist call finishes */ | ||
413 | static void bl_end_par_io_write(void *data) | ||
414 | { | ||
415 | struct nfs_write_data *wdata = data; | ||
416 | |||
417 | wdata->task.tk_status = 0; | ||
418 | wdata->verf.committed = NFS_FILE_SYNC; | ||
419 | INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); | ||
420 | schedule_work(&wdata->task.u.tk_work); | ||
421 | } | ||
422 | |||
/*
 * FIXME STUB - should mark the intersection of layout and page as bad so
 * that it is not used again.  Does nothing at the moment.
 */
static void mark_bad_read(void)
{
}
430 | |||
431 | /* | ||
432 | * map_block: map a requested I/0 block (isect) into an offset in the LVM | ||
433 | * block_device | ||
434 | */ | ||
435 | static void | ||
436 | map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) | ||
437 | { | ||
438 | dprintk("%s enter be=%p\n", __func__, be); | ||
439 | |||
440 | set_buffer_mapped(bh); | ||
441 | bh->b_bdev = be->be_mdev; | ||
442 | bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> | ||
443 | (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); | ||
444 | |||
445 | dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", | ||
446 | __func__, (unsigned long long)isect, (long)bh->b_blocknr, | ||
447 | bh->b_size); | ||
448 | return; | ||
449 | } | ||
450 | |||
451 | /* Given an unmapped page, zero it or read in page for COW, page is locked | ||
452 | * by caller. | ||
453 | */ | ||
454 | static int | ||
455 | init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) | ||
456 | { | ||
457 | struct buffer_head *bh = NULL; | ||
458 | int ret = 0; | ||
459 | sector_t isect; | ||
460 | |||
461 | dprintk("%s enter, %p\n", __func__, page); | ||
462 | BUG_ON(PageUptodate(page)); | ||
463 | if (!cow_read) { | ||
464 | zero_user_segment(page, 0, PAGE_SIZE); | ||
465 | SetPageUptodate(page); | ||
466 | goto cleanup; | ||
467 | } | ||
468 | |||
469 | bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); | ||
470 | if (!bh) { | ||
471 | ret = -ENOMEM; | ||
472 | goto cleanup; | ||
473 | } | ||
474 | |||
475 | isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; | ||
476 | map_block(bh, isect, cow_read); | ||
477 | if (!bh_uptodate_or_lock(bh)) | ||
478 | ret = bh_submit_read(bh); | ||
479 | if (ret) | ||
480 | goto cleanup; | ||
481 | SetPageUptodate(page); | ||
482 | |||
483 | cleanup: | ||
484 | bl_put_extent(cow_read); | ||
485 | if (bh) | ||
486 | free_buffer_head(bh); | ||
487 | if (ret) { | ||
488 | /* Need to mark layout with bad read...should now | ||
489 | * just use nfs4 for reads and writes. | ||
490 | */ | ||
491 | mark_bad_read(); | ||
492 | } | ||
493 | return ret; | ||
494 | } | ||
495 | |||
496 | static enum pnfs_try_status | ||
497 | bl_write_pagelist(struct nfs_write_data *wdata, int sync) | ||
498 | { | ||
499 | int i, ret, npg_zero, pg_index, last = 0; | ||
500 | struct bio *bio = NULL; | ||
501 | struct pnfs_block_extent *be = NULL, *cow_read = NULL; | ||
502 | sector_t isect, last_isect = 0, extent_length = 0; | ||
503 | struct parallel_io *par; | ||
504 | loff_t offset = wdata->args.offset; | ||
505 | size_t count = wdata->args.count; | ||
506 | struct page **pages = wdata->args.pages; | ||
507 | struct page *page; | ||
508 | pgoff_t index; | ||
509 | u64 temp; | ||
510 | int npg_per_block = | ||
511 | NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; | ||
512 | |||
513 | dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); | ||
514 | /* At this point, wdata->pages is a (sequential) list of nfs_pages. | ||
515 | * We want to write each, and if there is an error set pnfs_error | ||
516 | * to have it redone using nfs. | ||
517 | */ | ||
518 | par = alloc_parallel(wdata); | ||
519 | if (!par) | ||
520 | return PNFS_NOT_ATTEMPTED; | ||
521 | par->call_ops = *wdata->mds_ops; | ||
522 | par->call_ops.rpc_call_done = bl_rpc_do_nothing; | ||
523 | par->pnfs_callback = bl_end_par_io_write; | ||
524 | /* At this point, have to be more careful with error handling */ | ||
525 | |||
526 | isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); | ||
527 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); | ||
528 | if (!be || !is_writable(be, isect)) { | ||
529 | dprintk("%s no matching extents!\n", __func__); | ||
530 | wdata->pnfs_error = -EINVAL; | ||
531 | goto out; | ||
532 | } | ||
533 | |||
534 | /* First page inside INVALID extent */ | ||
535 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
536 | temp = offset >> PAGE_CACHE_SHIFT; | ||
537 | npg_zero = do_div(temp, npg_per_block); | ||
538 | isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & | ||
539 | (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); | ||
540 | extent_length = be->be_length - (isect - be->be_f_offset); | ||
541 | |||
542 | fill_invalid_ext: | ||
543 | dprintk("%s need to zero %d pages\n", __func__, npg_zero); | ||
544 | for (;npg_zero > 0; npg_zero--) { | ||
545 | /* page ref released in bl_end_io_write_zero */ | ||
546 | index = isect >> PAGE_CACHE_SECTOR_SHIFT; | ||
547 | dprintk("%s zero %dth page: index %lu isect %llu\n", | ||
548 | __func__, npg_zero, index, | ||
549 | (unsigned long long)isect); | ||
550 | page = | ||
551 | find_or_create_page(wdata->inode->i_mapping, index, | ||
552 | GFP_NOFS); | ||
553 | if (!page) { | ||
554 | dprintk("%s oom\n", __func__); | ||
555 | wdata->pnfs_error = -ENOMEM; | ||
556 | goto out; | ||
557 | } | ||
558 | |||
559 | /* PageDirty: Other will write this out | ||
560 | * PageWriteback: Other is writing this out | ||
561 | * PageUptodate: It was read before | ||
562 | * sector_initialized: already written out | ||
563 | */ | ||
564 | if (PageDirty(page) || PageWriteback(page) || | ||
565 | bl_is_sector_init(be->be_inval, isect)) { | ||
566 | print_page(page); | ||
567 | unlock_page(page); | ||
568 | page_cache_release(page); | ||
569 | goto next_page; | ||
570 | } | ||
571 | if (!PageUptodate(page)) { | ||
572 | /* New page, readin or zero it */ | ||
573 | init_page_for_write(page, cow_read); | ||
574 | } | ||
575 | set_page_writeback(page); | ||
576 | unlock_page(page); | ||
577 | |||
578 | ret = bl_mark_sectors_init(be->be_inval, isect, | ||
579 | PAGE_CACHE_SECTORS, | ||
580 | NULL); | ||
581 | if (unlikely(ret)) { | ||
582 | dprintk("%s bl_mark_sectors_init fail %d\n", | ||
583 | __func__, ret); | ||
584 | end_page_writeback(page); | ||
585 | page_cache_release(page); | ||
586 | wdata->pnfs_error = ret; | ||
587 | goto out; | ||
588 | } | ||
589 | bio = bl_add_page_to_bio(bio, npg_zero, WRITE, | ||
590 | isect, page, be, | ||
591 | bl_end_io_write_zero, par); | ||
592 | if (IS_ERR(bio)) { | ||
593 | wdata->pnfs_error = PTR_ERR(bio); | ||
594 | goto out; | ||
595 | } | ||
596 | /* FIXME: This should be done in bi_end_io */ | ||
597 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | ||
598 | page->index << PAGE_CACHE_SHIFT, | ||
599 | PAGE_CACHE_SIZE); | ||
600 | next_page: | ||
601 | isect += PAGE_CACHE_SECTORS; | ||
602 | extent_length -= PAGE_CACHE_SECTORS; | ||
603 | } | ||
604 | if (last) | ||
605 | goto write_done; | ||
606 | } | ||
607 | bio = bl_submit_bio(WRITE, bio); | ||
608 | |||
609 | /* Middle pages */ | ||
610 | pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; | ||
611 | for (i = pg_index; i < wdata->npages; i++) { | ||
612 | if (!extent_length) { | ||
613 | /* We've used up the previous extent */ | ||
614 | bl_put_extent(be); | ||
615 | bio = bl_submit_bio(WRITE, bio); | ||
616 | /* Get the next one */ | ||
617 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), | ||
618 | isect, NULL); | ||
619 | if (!be || !is_writable(be, isect)) { | ||
620 | wdata->pnfs_error = -EINVAL; | ||
621 | goto out; | ||
622 | } | ||
623 | extent_length = be->be_length - | ||
624 | (isect - be->be_f_offset); | ||
625 | } | ||
626 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
627 | ret = bl_mark_sectors_init(be->be_inval, isect, | ||
628 | PAGE_CACHE_SECTORS, | ||
629 | NULL); | ||
630 | if (unlikely(ret)) { | ||
631 | dprintk("%s bl_mark_sectors_init fail %d\n", | ||
632 | __func__, ret); | ||
633 | wdata->pnfs_error = ret; | ||
634 | goto out; | ||
635 | } | ||
636 | } | ||
637 | bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, | ||
638 | isect, pages[i], be, | ||
639 | bl_end_io_write, par); | ||
640 | if (IS_ERR(bio)) { | ||
641 | wdata->pnfs_error = PTR_ERR(bio); | ||
642 | goto out; | ||
643 | } | ||
644 | isect += PAGE_CACHE_SECTORS; | ||
645 | last_isect = isect; | ||
646 | extent_length -= PAGE_CACHE_SECTORS; | ||
647 | } | ||
648 | |||
649 | /* Last page inside INVALID extent */ | ||
650 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
651 | bio = bl_submit_bio(WRITE, bio); | ||
652 | temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; | ||
653 | npg_zero = npg_per_block - do_div(temp, npg_per_block); | ||
654 | if (npg_zero < npg_per_block) { | ||
655 | last = 1; | ||
656 | goto fill_invalid_ext; | ||
657 | } | ||
658 | } | ||
659 | |||
660 | write_done: | ||
661 | wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); | ||
662 | if (count < wdata->res.count) { | ||
663 | wdata->res.count = count; | ||
664 | } | ||
665 | out: | ||
666 | bl_put_extent(be); | ||
667 | bl_submit_bio(WRITE, bio); | ||
668 | put_parallel(par); | ||
669 | return PNFS_ATTEMPTED; | ||
670 | } | ||
671 | |||
672 | /* FIXME - range ignored */ | ||
673 | static void | ||
674 | release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) | ||
675 | { | ||
676 | int i; | ||
677 | struct pnfs_block_extent *be; | ||
678 | |||
679 | spin_lock(&bl->bl_ext_lock); | ||
680 | for (i = 0; i < EXTENT_LISTS; i++) { | ||
681 | while (!list_empty(&bl->bl_extents[i])) { | ||
682 | be = list_first_entry(&bl->bl_extents[i], | ||
683 | struct pnfs_block_extent, | ||
684 | be_node); | ||
685 | list_del(&be->be_node); | ||
686 | bl_put_extent(be); | ||
687 | } | ||
688 | } | ||
689 | spin_unlock(&bl->bl_ext_lock); | ||
690 | } | ||
691 | |||
692 | static void | ||
693 | release_inval_marks(struct pnfs_inval_markings *marks) | ||
694 | { | ||
695 | struct pnfs_inval_tracking *pos, *temp; | ||
696 | |||
697 | list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { | ||
698 | list_del(&pos->it_link); | ||
699 | kfree(pos); | ||
700 | } | ||
701 | return; | ||
702 | } | ||
703 | |||
704 | static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) | ||
705 | { | ||
706 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); | ||
707 | |||
708 | dprintk("%s enter\n", __func__); | ||
709 | release_extents(bl, NULL); | ||
710 | release_inval_marks(&bl->bl_inval); | ||
711 | kfree(bl); | ||
712 | } | ||
713 | |||
714 | static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, | ||
715 | gfp_t gfp_flags) | ||
716 | { | ||
717 | struct pnfs_block_layout *bl; | ||
718 | |||
719 | dprintk("%s enter\n", __func__); | ||
720 | bl = kzalloc(sizeof(*bl), gfp_flags); | ||
721 | if (!bl) | ||
722 | return NULL; | ||
723 | spin_lock_init(&bl->bl_ext_lock); | ||
724 | INIT_LIST_HEAD(&bl->bl_extents[0]); | ||
725 | INIT_LIST_HEAD(&bl->bl_extents[1]); | ||
726 | INIT_LIST_HEAD(&bl->bl_commit); | ||
727 | INIT_LIST_HEAD(&bl->bl_committing); | ||
728 | bl->bl_count = 0; | ||
729 | bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; | ||
730 | BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); | ||
731 | return &bl->bl_layout; | ||
732 | } | ||
733 | |||
/* Free a layout segment allocated by bl_alloc_lseg(); only the segment
 * structure itself is released here.
 */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);

	kfree(lseg);
}
739 | |||
740 | /* We pretty much ignore lseg, and store all data layout wide, so we | ||
741 | * can correctly merge. | ||
742 | */ | ||
743 | static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, | ||
744 | struct nfs4_layoutget_res *lgr, | ||
745 | gfp_t gfp_flags) | ||
746 | { | ||
747 | struct pnfs_layout_segment *lseg; | ||
748 | int status; | ||
749 | |||
750 | dprintk("%s enter\n", __func__); | ||
751 | lseg = kzalloc(sizeof(*lseg), gfp_flags); | ||
752 | if (!lseg) | ||
753 | return ERR_PTR(-ENOMEM); | ||
754 | status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); | ||
755 | if (status) { | ||
756 | /* We don't want to call the full-blown bl_free_lseg, | ||
757 | * since on error extents were not touched. | ||
758 | */ | ||
759 | kfree(lseg); | ||
760 | return ERR_PTR(status); | ||
761 | } | ||
762 | return lseg; | ||
763 | } | ||
764 | |||
/*
 * LAYOUTCOMMIT encode hook: forwards to encode_pnfs_block_layoutupdate()
 * with the block layout that embeds @lo.
 */
static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *blk_lo = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(blk_lo, xdr, arg);
}
772 | |||
773 | static void | ||
774 | bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) | ||
775 | { | ||
776 | struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; | ||
777 | |||
778 | dprintk("%s enter\n", __func__); | ||
779 | clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); | ||
780 | } | ||
781 | |||
782 | static void free_blk_mountid(struct block_mount_id *mid) | ||
783 | { | ||
784 | if (mid) { | ||
785 | struct pnfs_block_dev *dev; | ||
786 | spin_lock(&mid->bm_lock); | ||
787 | while (!list_empty(&mid->bm_devlist)) { | ||
788 | dev = list_first_entry(&mid->bm_devlist, | ||
789 | struct pnfs_block_dev, | ||
790 | bm_node); | ||
791 | list_del(&dev->bm_node); | ||
792 | bl_free_block_dev(dev); | ||
793 | } | ||
794 | spin_unlock(&mid->bm_lock); | ||
795 | kfree(mid); | ||
796 | } | ||
797 | } | ||
798 | |||
799 | /* This is mostly copied from the filelayout's get_device_info function. | ||
800 | * It seems much of this should be at the generic pnfs level. | ||
801 | */ | ||
802 | static struct pnfs_block_dev * | ||
803 | nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, | ||
804 | struct nfs4_deviceid *d_id) | ||
805 | { | ||
806 | struct pnfs_device *dev; | ||
807 | struct pnfs_block_dev *rv = NULL; | ||
808 | u32 max_resp_sz; | ||
809 | int max_pages; | ||
810 | struct page **pages = NULL; | ||
811 | int i, rc; | ||
812 | |||
813 | /* | ||
814 | * Use the session max response size as the basis for setting | ||
815 | * GETDEVICEINFO's maxcount | ||
816 | */ | ||
817 | max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; | ||
818 | max_pages = max_resp_sz >> PAGE_SHIFT; | ||
819 | dprintk("%s max_resp_sz %u max_pages %d\n", | ||
820 | __func__, max_resp_sz, max_pages); | ||
821 | |||
822 | dev = kmalloc(sizeof(*dev), GFP_NOFS); | ||
823 | if (!dev) { | ||
824 | dprintk("%s kmalloc failed\n", __func__); | ||
825 | return NULL; | ||
826 | } | ||
827 | |||
828 | pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); | ||
829 | if (pages == NULL) { | ||
830 | kfree(dev); | ||
831 | return NULL; | ||
832 | } | ||
833 | for (i = 0; i < max_pages; i++) { | ||
834 | pages[i] = alloc_page(GFP_NOFS); | ||
835 | if (!pages[i]) | ||
836 | goto out_free; | ||
837 | } | ||
838 | |||
839 | memcpy(&dev->dev_id, d_id, sizeof(*d_id)); | ||
840 | dev->layout_type = LAYOUT_BLOCK_VOLUME; | ||
841 | dev->pages = pages; | ||
842 | dev->pgbase = 0; | ||
843 | dev->pglen = PAGE_SIZE * max_pages; | ||
844 | dev->mincount = 0; | ||
845 | |||
846 | dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); | ||
847 | rc = nfs4_proc_getdeviceinfo(server, dev); | ||
848 | dprintk("%s getdevice info returns %d\n", __func__, rc); | ||
849 | if (rc) | ||
850 | goto out_free; | ||
851 | |||
852 | rv = nfs4_blk_decode_device(server, dev); | ||
853 | out_free: | ||
854 | for (i = 0; i < max_pages; i++) | ||
855 | __free_page(pages[i]); | ||
856 | kfree(pages); | ||
857 | kfree(dev); | ||
858 | return rv; | ||
859 | } | ||
860 | |||
861 | static int | ||
862 | bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) | ||
863 | { | ||
864 | struct block_mount_id *b_mt_id = NULL; | ||
865 | struct pnfs_devicelist *dlist = NULL; | ||
866 | struct pnfs_block_dev *bdev; | ||
867 | LIST_HEAD(block_disklist); | ||
868 | int status = 0, i; | ||
869 | |||
870 | dprintk("%s enter\n", __func__); | ||
871 | |||
872 | if (server->pnfs_blksize == 0) { | ||
873 | dprintk("%s Server did not return blksize\n", __func__); | ||
874 | return -EINVAL; | ||
875 | } | ||
876 | b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); | ||
877 | if (!b_mt_id) { | ||
878 | status = -ENOMEM; | ||
879 | goto out_error; | ||
880 | } | ||
881 | /* Initialize nfs4 block layout mount id */ | ||
882 | spin_lock_init(&b_mt_id->bm_lock); | ||
883 | INIT_LIST_HEAD(&b_mt_id->bm_devlist); | ||
884 | |||
885 | dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); | ||
886 | if (!dlist) { | ||
887 | status = -ENOMEM; | ||
888 | goto out_error; | ||
889 | } | ||
890 | dlist->eof = 0; | ||
891 | while (!dlist->eof) { | ||
892 | status = nfs4_proc_getdevicelist(server, fh, dlist); | ||
893 | if (status) | ||
894 | goto out_error; | ||
895 | dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", | ||
896 | __func__, dlist->num_devs, dlist->eof); | ||
897 | for (i = 0; i < dlist->num_devs; i++) { | ||
898 | bdev = nfs4_blk_get_deviceinfo(server, fh, | ||
899 | &dlist->dev_id[i]); | ||
900 | if (!bdev) { | ||
901 | status = -ENODEV; | ||
902 | goto out_error; | ||
903 | } | ||
904 | spin_lock(&b_mt_id->bm_lock); | ||
905 | list_add(&bdev->bm_node, &b_mt_id->bm_devlist); | ||
906 | spin_unlock(&b_mt_id->bm_lock); | ||
907 | } | ||
908 | } | ||
909 | dprintk("%s SUCCESS\n", __func__); | ||
910 | server->pnfs_ld_data = b_mt_id; | ||
911 | |||
912 | out_return: | ||
913 | kfree(dlist); | ||
914 | return status; | ||
915 | |||
916 | out_error: | ||
917 | free_blk_mountid(b_mt_id); | ||
918 | goto out_return; | ||
919 | } | ||
920 | |||
921 | static int | ||
922 | bl_clear_layoutdriver(struct nfs_server *server) | ||
923 | { | ||
924 | struct block_mount_id *b_mt_id = server->pnfs_ld_data; | ||
925 | |||
926 | dprintk("%s enter\n", __func__); | ||
927 | free_blk_mountid(b_mt_id); | ||
928 | dprintk("%s RETURNS\n", __func__); | ||
929 | return 0; | ||
930 | } | ||
931 | |||
932 | static const struct nfs_pageio_ops bl_pg_read_ops = { | ||
933 | .pg_init = pnfs_generic_pg_init_read, | ||
934 | .pg_test = pnfs_generic_pg_test, | ||
935 | .pg_doio = pnfs_generic_pg_readpages, | ||
936 | }; | ||
937 | |||
938 | static const struct nfs_pageio_ops bl_pg_write_ops = { | ||
939 | .pg_init = pnfs_generic_pg_init_write, | ||
940 | .pg_test = pnfs_generic_pg_test, | ||
941 | .pg_doio = pnfs_generic_pg_writepages, | ||
942 | }; | ||
943 | |||
944 | static struct pnfs_layoutdriver_type blocklayout_type = { | ||
945 | .id = LAYOUT_BLOCK_VOLUME, | ||
946 | .name = "LAYOUT_BLOCK_VOLUME", | ||
947 | .read_pagelist = bl_read_pagelist, | ||
948 | .write_pagelist = bl_write_pagelist, | ||
949 | .alloc_layout_hdr = bl_alloc_layout_hdr, | ||
950 | .free_layout_hdr = bl_free_layout_hdr, | ||
951 | .alloc_lseg = bl_alloc_lseg, | ||
952 | .free_lseg = bl_free_lseg, | ||
953 | .encode_layoutcommit = bl_encode_layoutcommit, | ||
954 | .cleanup_layoutcommit = bl_cleanup_layoutcommit, | ||
955 | .set_layoutdriver = bl_set_layoutdriver, | ||
956 | .clear_layoutdriver = bl_clear_layoutdriver, | ||
957 | .pg_read_ops = &bl_pg_read_ops, | ||
958 | .pg_write_ops = &bl_pg_write_ops, | ||
959 | }; | ||
960 | |||
961 | static const struct rpc_pipe_ops bl_upcall_ops = { | ||
962 | .upcall = bl_pipe_upcall, | ||
963 | .downcall = bl_pipe_downcall, | ||
964 | .destroy_msg = bl_pipe_destroy_msg, | ||
965 | }; | ||
966 | |||
967 | static int __init nfs4blocklayout_init(void) | ||
968 | { | ||
969 | struct vfsmount *mnt; | ||
970 | struct path path; | ||
971 | int ret; | ||
972 | |||
973 | dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); | ||
974 | |||
975 | ret = pnfs_register_layoutdriver(&blocklayout_type); | ||
976 | if (ret) | ||
977 | goto out; | ||
978 | |||
979 | init_waitqueue_head(&bl_wq); | ||
980 | |||
981 | mnt = rpc_get_mount(); | ||
982 | if (IS_ERR(mnt)) { | ||
983 | ret = PTR_ERR(mnt); | ||
984 | goto out_remove; | ||
985 | } | ||
986 | |||
987 | ret = vfs_path_lookup(mnt->mnt_root, | ||
988 | mnt, | ||
989 | NFS_PIPE_DIRNAME, 0, &path); | ||
990 | if (ret) | ||
991 | goto out_remove; | ||
992 | |||
993 | bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, | ||
994 | &bl_upcall_ops, 0); | ||
995 | if (IS_ERR(bl_device_pipe)) { | ||
996 | ret = PTR_ERR(bl_device_pipe); | ||
997 | goto out_remove; | ||
998 | } | ||
999 | out: | ||
1000 | return ret; | ||
1001 | |||
1002 | out_remove: | ||
1003 | pnfs_unregister_layoutdriver(&blocklayout_type); | ||
1004 | return ret; | ||
1005 | } | ||
1006 | |||
1007 | static void __exit nfs4blocklayout_exit(void) | ||
1008 | { | ||
1009 | dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", | ||
1010 | __func__); | ||
1011 | |||
1012 | pnfs_unregister_layoutdriver(&blocklayout_type); | ||
1013 | rpc_unlink(bl_device_pipe); | ||
1014 | } | ||
1015 | |||
1016 | MODULE_ALIAS("nfs-layouttype4-3"); | ||
1017 | |||
1018 | module_init(nfs4blocklayout_init); | ||
1019 | module_exit(nfs4blocklayout_exit); | ||
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 000000000000..f27d827960a3 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.h | |||
@@ -0,0 +1,207 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/blocklayout.h | ||
3 | * | ||
4 | * Module for the NFSv4.1 pNFS block layout driver. | ||
5 | * | ||
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Andy Adamson <andros@citi.umich.edu> | ||
10 | * Fred Isaman <iisaman@umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | #ifndef FS_NFS_NFS4BLOCKLAYOUT_H | ||
33 | #define FS_NFS_NFS4BLOCKLAYOUT_H | ||
34 | |||
35 | #include <linux/device-mapper.h> | ||
36 | #include <linux/nfs_fs.h> | ||
37 | #include <linux/sunrpc/rpc_pipe_fs.h> | ||
38 | |||
39 | #include "../pnfs.h" | ||
40 | |||
/* Number of sectors in one page-cache page, and the matching shift */
#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
43 | |||
44 | struct block_mount_id { | ||
45 | spinlock_t bm_lock; /* protects list */ | ||
46 | struct list_head bm_devlist; /* holds pnfs_block_dev */ | ||
47 | }; | ||
48 | |||
49 | struct pnfs_block_dev { | ||
50 | struct list_head bm_node; | ||
51 | struct nfs4_deviceid bm_mdevid; /* associated devid */ | ||
52 | struct block_device *bm_mdev; /* meta device itself */ | ||
53 | }; | ||
54 | |||
/* State of a block extent; the numeric values are decoded from the wire */
enum exstate4 {
	PNFS_BLOCK_READWRITE_DATA	= 0,
	PNFS_BLOCK_READ_DATA		= 1,
	/* mapped, but data is invalid */
	PNFS_BLOCK_INVALID_DATA		= 2,
	/* unmapped, it's a hole */
	PNFS_BLOCK_NONE_DATA		= 3
};
61 | |||
/* Upper bound on tag bit numbers: every tag bitnum used must be below it */
#define MY_MAX_TAGS (15)
63 | |||
64 | struct my_tree { | ||
65 | sector_t mtt_step_size; /* Internal sector alignment */ | ||
66 | struct list_head mtt_stub; /* Should be a radix tree */ | ||
67 | }; | ||
68 | |||
69 | struct pnfs_inval_markings { | ||
70 | spinlock_t im_lock; | ||
71 | struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ | ||
72 | sector_t im_block_size; /* Server blocksize in sectors */ | ||
73 | }; | ||
74 | |||
75 | struct pnfs_inval_tracking { | ||
76 | struct list_head it_link; | ||
77 | int it_sector; | ||
78 | int it_tags; | ||
79 | }; | ||
80 | |||
81 | /* sector_t fields are all in 512-byte sectors */ | ||
82 | struct pnfs_block_extent { | ||
83 | struct kref be_refcnt; | ||
84 | struct list_head be_node; /* link into lseg list */ | ||
85 | struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ | ||
86 | struct block_device *be_mdev; | ||
87 | sector_t be_f_offset; /* the starting offset in the file */ | ||
88 | sector_t be_length; /* the size of the extent */ | ||
89 | sector_t be_v_offset; /* the starting offset in the volume */ | ||
90 | enum exstate4 be_state; /* the state of this extent */ | ||
91 | struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ | ||
92 | }; | ||
93 | |||
94 | /* Shortened extent used by LAYOUTCOMMIT */ | ||
95 | struct pnfs_block_short_extent { | ||
96 | struct list_head bse_node; | ||
97 | struct nfs4_deviceid bse_devid; | ||
98 | struct block_device *bse_mdev; | ||
99 | sector_t bse_f_offset; /* the starting offset in the file */ | ||
100 | sector_t bse_length; /* the size of the extent */ | ||
101 | }; | ||
102 | |||
103 | static inline void | ||
104 | BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) | ||
105 | { | ||
106 | spin_lock_init(&marks->im_lock); | ||
107 | INIT_LIST_HEAD(&marks->im_tree.mtt_stub); | ||
108 | marks->im_block_size = blocksize; | ||
109 | marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, | ||
110 | blocksize); | ||
111 | } | ||
112 | |||
/* Index into pnfs_block_layout.bl_extents[]; see bl_choose_list() */
enum extentclass4 {
	/* READWRITE and INVAL */
	RW_EXTENT	= 0,
	/* READ and NONE */
	RO_EXTENT	= 1,
	/* number of extent lists */
	EXTENT_LISTS	= 2,
};
118 | |||
119 | static inline int bl_choose_list(enum exstate4 state) | ||
120 | { | ||
121 | if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) | ||
122 | return RO_EXTENT; | ||
123 | else | ||
124 | return RW_EXTENT; | ||
125 | } | ||
126 | |||
127 | struct pnfs_block_layout { | ||
128 | struct pnfs_layout_hdr bl_layout; | ||
129 | struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ | ||
130 | spinlock_t bl_ext_lock; /* Protects list manipulation */ | ||
131 | struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ | ||
132 | struct list_head bl_commit; /* Needs layout commit */ | ||
133 | struct list_head bl_committing; /* Layout committing */ | ||
134 | unsigned int bl_count; /* entries in bl_commit */ | ||
135 | sector_t bl_blocksize; /* Server blocksize in sectors */ | ||
136 | }; | ||
137 | |||
/*
 * Fetch the block_mount_id stored as driver data on the server that owns
 * layout header @lo.  The argument is parenthesised so that an expression
 * (not just a plain identifier) can be passed safely.
 */
#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER((lo)->plh_inode)->pnfs_ld_data))
139 | |||
140 | static inline struct pnfs_block_layout * | ||
141 | BLK_LO2EXT(struct pnfs_layout_hdr *lo) | ||
142 | { | ||
143 | return container_of(lo, struct pnfs_block_layout, bl_layout); | ||
144 | } | ||
145 | |||
146 | static inline struct pnfs_block_layout * | ||
147 | BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) | ||
148 | { | ||
149 | return BLK_LO2EXT(lseg->pls_layout); | ||
150 | } | ||
151 | |||
152 | struct bl_dev_msg { | ||
153 | int status; | ||
154 | uint32_t major, minor; | ||
155 | }; | ||
156 | |||
157 | struct bl_msg_hdr { | ||
158 | u8 type; | ||
159 | u16 totallen; /* length of entire message, including hdr itself */ | ||
160 | }; | ||
161 | |||
162 | extern struct dentry *bl_device_pipe; | ||
163 | extern wait_queue_head_t bl_wq; | ||
164 | |||
165 | #define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ | ||
166 | #define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ | ||
167 | #define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ | ||
168 | #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ | ||
169 | #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ | ||
170 | |||
171 | /* blocklayoutdev.c */ | ||
172 | ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, | ||
173 | char __user *, size_t); | ||
174 | ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); | ||
175 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *); | ||
176 | struct block_device *nfs4_blkdev_get(dev_t dev); | ||
177 | int nfs4_blkdev_put(struct block_device *bdev); | ||
178 | struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, | ||
179 | struct pnfs_device *dev); | ||
180 | int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | ||
181 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); | ||
182 | |||
183 | /* blocklayoutdm.c */ | ||
184 | void bl_free_block_dev(struct pnfs_block_dev *bdev); | ||
185 | |||
186 | /* extents.c */ | ||
187 | struct pnfs_block_extent * | ||
188 | bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, | ||
189 | struct pnfs_block_extent **cow_read); | ||
190 | int bl_mark_sectors_init(struct pnfs_inval_markings *marks, | ||
191 | sector_t offset, sector_t length, | ||
192 | sector_t **pages); | ||
193 | void bl_put_extent(struct pnfs_block_extent *be); | ||
194 | struct pnfs_block_extent *bl_alloc_extent(void); | ||
195 | int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); | ||
196 | int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, | ||
197 | struct xdr_stream *xdr, | ||
198 | const struct nfs4_layoutcommit_args *arg); | ||
199 | void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, | ||
200 | const struct nfs4_layoutcommit_args *arg, | ||
201 | int status); | ||
202 | int bl_add_merge_extent(struct pnfs_block_layout *bl, | ||
203 | struct pnfs_block_extent *new); | ||
204 | int bl_mark_for_commit(struct pnfs_block_extent *be, | ||
205 | sector_t offset, sector_t length); | ||
206 | |||
207 | #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ | ||
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 000000000000..a83b393fb01c --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c | |||
@@ -0,0 +1,410 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/blocklayoutdev.c | ||
3 | * | ||
4 | * Device operations for the pnfs nfs4 block layout driver. | ||
5 | * | ||
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Andy Adamson <andros@citi.umich.edu> | ||
10 | * Fred Isaman <iisaman@umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/buffer_head.h> /* __bread */ | ||
34 | |||
35 | #include <linux/genhd.h> | ||
36 | #include <linux/blkdev.h> | ||
37 | #include <linux/hash.h> | ||
38 | |||
39 | #include "blocklayout.h" | ||
40 | |||
41 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
42 | |||
43 | static int decode_sector_number(__be32 **rp, sector_t *sp) | ||
44 | { | ||
45 | uint64_t s; | ||
46 | |||
47 | *rp = xdr_decode_hyper(*rp, &s); | ||
48 | if (s & 0x1ff) { | ||
49 | printk(KERN_WARNING "%s: sector not aligned\n", __func__); | ||
50 | return -1; | ||
51 | } | ||
52 | *sp = s >> SECTOR_SHIFT; | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | /* Open a block_device by device number. */ | ||
57 | struct block_device *nfs4_blkdev_get(dev_t dev) | ||
58 | { | ||
59 | struct block_device *bd; | ||
60 | |||
61 | dprintk("%s enter\n", __func__); | ||
62 | bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); | ||
63 | if (IS_ERR(bd)) | ||
64 | goto fail; | ||
65 | return bd; | ||
66 | fail: | ||
67 | dprintk("%s failed to open device : %ld\n", | ||
68 | __func__, PTR_ERR(bd)); | ||
69 | return NULL; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Release the block device | ||
74 | */ | ||
75 | int nfs4_blkdev_put(struct block_device *bdev) | ||
76 | { | ||
77 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), | ||
78 | MINOR(bdev->bd_dev)); | ||
79 | return blkdev_put(bdev, FMODE_READ); | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Shouldn't there be a rpc_generic_upcall() to do this for us? | ||
84 | */ | ||
85 | ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, | ||
86 | char __user *dst, size_t buflen) | ||
87 | { | ||
88 | char *data = (char *)msg->data + msg->copied; | ||
89 | size_t mlen = min(msg->len - msg->copied, buflen); | ||
90 | unsigned long left; | ||
91 | |||
92 | left = copy_to_user(dst, data, mlen); | ||
93 | if (left == mlen) { | ||
94 | msg->errno = -EFAULT; | ||
95 | return -EFAULT; | ||
96 | } | ||
97 | |||
98 | mlen -= left; | ||
99 | msg->copied += mlen; | ||
100 | msg->errno = 0; | ||
101 | return mlen; | ||
102 | } | ||
103 | |||
104 | static struct bl_dev_msg bl_mount_reply; | ||
105 | |||
106 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | ||
107 | size_t mlen) | ||
108 | { | ||
109 | if (mlen != sizeof (struct bl_dev_msg)) | ||
110 | return -EINVAL; | ||
111 | |||
112 | if (copy_from_user(&bl_mount_reply, src, mlen) != 0) | ||
113 | return -EFAULT; | ||
114 | |||
115 | wake_up(&bl_wq); | ||
116 | |||
117 | return mlen; | ||
118 | } | ||
119 | |||
120 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) | ||
121 | { | ||
122 | if (msg->errno >= 0) | ||
123 | return; | ||
124 | wake_up(&bl_wq); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | ||
129 | */ | ||
130 | struct pnfs_block_dev * | ||
131 | nfs4_blk_decode_device(struct nfs_server *server, | ||
132 | struct pnfs_device *dev) | ||
133 | { | ||
134 | struct pnfs_block_dev *rv = NULL; | ||
135 | struct block_device *bd = NULL; | ||
136 | struct rpc_pipe_msg msg; | ||
137 | struct bl_msg_hdr bl_msg = { | ||
138 | .type = BL_DEVICE_MOUNT, | ||
139 | .totallen = dev->mincount, | ||
140 | }; | ||
141 | uint8_t *dataptr; | ||
142 | DECLARE_WAITQUEUE(wq, current); | ||
143 | struct bl_dev_msg *reply = &bl_mount_reply; | ||
144 | int offset, len, i; | ||
145 | |||
146 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | ||
147 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | ||
148 | dev->mincount); | ||
149 | |||
150 | memset(&msg, 0, sizeof(msg)); | ||
151 | msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); | ||
152 | if (!msg.data) { | ||
153 | rv = ERR_PTR(-ENOMEM); | ||
154 | goto out; | ||
155 | } | ||
156 | |||
157 | memcpy(msg.data, &bl_msg, sizeof(bl_msg)); | ||
158 | dataptr = (uint8_t *) msg.data; | ||
159 | len = dev->mincount; | ||
160 | offset = sizeof(bl_msg); | ||
161 | for (i = 0; len > 0; i++) { | ||
162 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | ||
163 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | ||
164 | len -= PAGE_CACHE_SIZE; | ||
165 | offset += PAGE_CACHE_SIZE; | ||
166 | } | ||
167 | msg.len = sizeof(bl_msg) + dev->mincount; | ||
168 | |||
169 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | ||
170 | add_wait_queue(&bl_wq, &wq); | ||
171 | if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { | ||
172 | remove_wait_queue(&bl_wq, &wq); | ||
173 | goto out; | ||
174 | } | ||
175 | |||
176 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
177 | schedule(); | ||
178 | __set_current_state(TASK_RUNNING); | ||
179 | remove_wait_queue(&bl_wq, &wq); | ||
180 | |||
181 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | ||
182 | dprintk("%s failed to open device: %d\n", | ||
183 | __func__, reply->status); | ||
184 | rv = ERR_PTR(-EINVAL); | ||
185 | goto out; | ||
186 | } | ||
187 | |||
188 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); | ||
189 | if (IS_ERR(bd)) { | ||
190 | dprintk("%s failed to open device : %ld\n", | ||
191 | __func__, PTR_ERR(bd)); | ||
192 | goto out; | ||
193 | } | ||
194 | |||
195 | rv = kzalloc(sizeof(*rv), GFP_NOFS); | ||
196 | if (!rv) { | ||
197 | rv = ERR_PTR(-ENOMEM); | ||
198 | goto out; | ||
199 | } | ||
200 | |||
201 | rv->bm_mdev = bd; | ||
202 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); | ||
203 | dprintk("%s Created device %s with bd_block_size %u\n", | ||
204 | __func__, | ||
205 | bd->bd_disk->disk_name, | ||
206 | bd->bd_block_size); | ||
207 | |||
208 | out: | ||
209 | kfree(msg.data); | ||
210 | return rv; | ||
211 | } | ||
212 | |||
213 | /* Map deviceid returned by the server to constructed block_device */ | ||
214 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, | ||
215 | struct nfs4_deviceid *id) | ||
216 | { | ||
217 | struct block_device *rv = NULL; | ||
218 | struct block_mount_id *mid; | ||
219 | struct pnfs_block_dev *dev; | ||
220 | |||
221 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); | ||
222 | mid = BLK_ID(lo); | ||
223 | spin_lock(&mid->bm_lock); | ||
224 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { | ||
225 | if (memcmp(id->data, dev->bm_mdevid.data, | ||
226 | NFS4_DEVICEID4_SIZE) == 0) { | ||
227 | rv = dev->bm_mdev; | ||
228 | goto out; | ||
229 | } | ||
230 | } | ||
231 | out: | ||
232 | spin_unlock(&mid->bm_lock); | ||
233 | dprintk("%s returning %p\n", __func__, rv); | ||
234 | return rv; | ||
235 | } | ||
236 | |||
237 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ | ||
238 | struct layout_verification { | ||
239 | u32 mode; /* R or RW */ | ||
240 | u64 start; /* Expected start of next non-COW extent */ | ||
241 | u64 inval; /* Start of INVAL coverage */ | ||
242 | u64 cowread; /* End of COW read coverage */ | ||
243 | }; | ||
244 | |||
245 | /* Verify the extent meets the layout requirements of the pnfs-block draft, | ||
246 | * section 2.3.1. | ||
247 | */ | ||
248 | static int verify_extent(struct pnfs_block_extent *be, | ||
249 | struct layout_verification *lv) | ||
250 | { | ||
251 | if (lv->mode == IOMODE_READ) { | ||
252 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || | ||
253 | be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
254 | return -EIO; | ||
255 | if (be->be_f_offset != lv->start) | ||
256 | return -EIO; | ||
257 | lv->start += be->be_length; | ||
258 | return 0; | ||
259 | } | ||
260 | /* lv->mode == IOMODE_RW */ | ||
261 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { | ||
262 | if (be->be_f_offset != lv->start) | ||
263 | return -EIO; | ||
264 | if (lv->cowread > lv->start) | ||
265 | return -EIO; | ||
266 | lv->start += be->be_length; | ||
267 | lv->inval = lv->start; | ||
268 | return 0; | ||
269 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
270 | if (be->be_f_offset != lv->start) | ||
271 | return -EIO; | ||
272 | lv->start += be->be_length; | ||
273 | return 0; | ||
274 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { | ||
275 | if (be->be_f_offset > lv->start) | ||
276 | return -EIO; | ||
277 | if (be->be_f_offset < lv->inval) | ||
278 | return -EIO; | ||
279 | if (be->be_f_offset < lv->cowread) | ||
280 | return -EIO; | ||
281 | /* It looks like you might want to min this with lv->start, | ||
282 | * but you really don't. | ||
283 | */ | ||
284 | lv->inval = lv->inval + be->be_length; | ||
285 | lv->cowread = be->be_f_offset + be->be_length; | ||
286 | return 0; | ||
287 | } else | ||
288 | return -EIO; | ||
289 | } | ||
290 | |||
291 | /* XDR decode pnfs_block_layout4 structure */ | ||
292 | int | ||
293 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | ||
294 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) | ||
295 | { | ||
296 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); | ||
297 | int i, status = -EIO; | ||
298 | uint32_t count; | ||
299 | struct pnfs_block_extent *be = NULL, *save; | ||
300 | struct xdr_stream stream; | ||
301 | struct xdr_buf buf; | ||
302 | struct page *scratch; | ||
303 | __be32 *p; | ||
304 | struct layout_verification lv = { | ||
305 | .mode = lgr->range.iomode, | ||
306 | .start = lgr->range.offset >> SECTOR_SHIFT, | ||
307 | .inval = lgr->range.offset >> SECTOR_SHIFT, | ||
308 | .cowread = lgr->range.offset >> SECTOR_SHIFT, | ||
309 | }; | ||
310 | LIST_HEAD(extents); | ||
311 | |||
312 | dprintk("---> %s\n", __func__); | ||
313 | |||
314 | scratch = alloc_page(gfp_flags); | ||
315 | if (!scratch) | ||
316 | return -ENOMEM; | ||
317 | |||
318 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | ||
319 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | ||
320 | |||
321 | p = xdr_inline_decode(&stream, 4); | ||
322 | if (unlikely(!p)) | ||
323 | goto out_err; | ||
324 | |||
325 | count = be32_to_cpup(p++); | ||
326 | |||
327 | dprintk("%s enter, number of extents %i\n", __func__, count); | ||
328 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); | ||
329 | if (unlikely(!p)) | ||
330 | goto out_err; | ||
331 | |||
332 | /* Decode individual extents, putting them in temporary | ||
333 | * staging area until whole layout is decoded to make error | ||
334 | * recovery easier. | ||
335 | */ | ||
336 | for (i = 0; i < count; i++) { | ||
337 | be = bl_alloc_extent(); | ||
338 | if (!be) { | ||
339 | status = -ENOMEM; | ||
340 | goto out_err; | ||
341 | } | ||
342 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); | ||
343 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | ||
344 | be->be_mdev = translate_devid(lo, &be->be_devid); | ||
345 | if (!be->be_mdev) | ||
346 | goto out_err; | ||
347 | |||
348 | /* The next three values are read in as bytes, | ||
349 | * but stored as 512-byte sector lengths | ||
350 | */ | ||
351 | if (decode_sector_number(&p, &be->be_f_offset) < 0) | ||
352 | goto out_err; | ||
353 | if (decode_sector_number(&p, &be->be_length) < 0) | ||
354 | goto out_err; | ||
355 | if (decode_sector_number(&p, &be->be_v_offset) < 0) | ||
356 | goto out_err; | ||
357 | be->be_state = be32_to_cpup(p++); | ||
358 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
359 | be->be_inval = &bl->bl_inval; | ||
360 | if (verify_extent(be, &lv)) { | ||
361 | dprintk("%s verify failed\n", __func__); | ||
362 | goto out_err; | ||
363 | } | ||
364 | list_add_tail(&be->be_node, &extents); | ||
365 | } | ||
366 | if (lgr->range.offset + lgr->range.length != | ||
367 | lv.start << SECTOR_SHIFT) { | ||
368 | dprintk("%s Final length mismatch\n", __func__); | ||
369 | be = NULL; | ||
370 | goto out_err; | ||
371 | } | ||
372 | if (lv.start < lv.cowread) { | ||
373 | dprintk("%s Final uncovered COW extent\n", __func__); | ||
374 | be = NULL; | ||
375 | goto out_err; | ||
376 | } | ||
377 | /* Extents decoded properly, now try to merge them in to | ||
378 | * existing layout extents. | ||
379 | */ | ||
380 | spin_lock(&bl->bl_ext_lock); | ||
381 | list_for_each_entry_safe(be, save, &extents, be_node) { | ||
382 | list_del(&be->be_node); | ||
383 | status = bl_add_merge_extent(bl, be); | ||
384 | if (status) { | ||
385 | spin_unlock(&bl->bl_ext_lock); | ||
386 | /* This is a fairly catastrophic error, as the | ||
387 | * entire layout extent lists are now corrupted. | ||
388 | * We should have some way to distinguish this. | ||
389 | */ | ||
390 | be = NULL; | ||
391 | goto out_err; | ||
392 | } | ||
393 | } | ||
394 | spin_unlock(&bl->bl_ext_lock); | ||
395 | status = 0; | ||
396 | out: | ||
397 | __free_page(scratch); | ||
398 | dprintk("%s returns %i\n", __func__, status); | ||
399 | return status; | ||
400 | |||
401 | out_err: | ||
402 | bl_put_extent(be); | ||
403 | while (!list_empty(&extents)) { | ||
404 | be = list_first_entry(&extents, struct pnfs_block_extent, | ||
405 | be_node); | ||
406 | list_del(&be->be_node); | ||
407 | bl_put_extent(be); | ||
408 | } | ||
409 | goto out; | ||
410 | } | ||
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 000000000000..d055c7558073 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c | |||
@@ -0,0 +1,111 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/blocklayoutdm.c | ||
3 | * | ||
4 | * Module for the NFSv4.1 pNFS block layout driver. | ||
5 | * | ||
6 | * Copyright (c) 2007 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Fred Isaman <iisaman@umich.edu> | ||
10 | * Andy Adamson <andros@citi.umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | |||
33 | #include <linux/genhd.h> /* gendisk - used in a dprintk*/ | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/hash.h> | ||
36 | |||
37 | #include "blocklayout.h" | ||
38 | |||
39 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
40 | |||
41 | static void dev_remove(dev_t dev) | ||
42 | { | ||
43 | struct rpc_pipe_msg msg; | ||
44 | struct bl_dev_msg bl_umount_request; | ||
45 | struct bl_msg_hdr bl_msg = { | ||
46 | .type = BL_DEVICE_UMOUNT, | ||
47 | .totallen = sizeof(bl_umount_request), | ||
48 | }; | ||
49 | uint8_t *dataptr; | ||
50 | DECLARE_WAITQUEUE(wq, current); | ||
51 | |||
52 | dprintk("Entering %s\n", __func__); | ||
53 | |||
54 | memset(&msg, 0, sizeof(msg)); | ||
55 | msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); | ||
56 | if (!msg.data) | ||
57 | goto out; | ||
58 | |||
59 | memset(&bl_umount_request, 0, sizeof(bl_umount_request)); | ||
60 | bl_umount_request.major = MAJOR(dev); | ||
61 | bl_umount_request.minor = MINOR(dev); | ||
62 | |||
63 | memcpy(msg.data, &bl_msg, sizeof(bl_msg)); | ||
64 | dataptr = (uint8_t *) msg.data; | ||
65 | memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); | ||
66 | msg.len = sizeof(bl_msg) + bl_msg.totallen; | ||
67 | |||
68 | add_wait_queue(&bl_wq, &wq); | ||
69 | if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { | ||
70 | remove_wait_queue(&bl_wq, &wq); | ||
71 | goto out; | ||
72 | } | ||
73 | |||
74 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
75 | schedule(); | ||
76 | __set_current_state(TASK_RUNNING); | ||
77 | remove_wait_queue(&bl_wq, &wq); | ||
78 | |||
79 | out: | ||
80 | kfree(msg.data); | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Release meta device | ||
85 | */ | ||
86 | static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) | ||
87 | { | ||
88 | int rv; | ||
89 | |||
90 | dprintk("%s Releasing\n", __func__); | ||
91 | rv = nfs4_blkdev_put(bdev->bm_mdev); | ||
92 | if (rv) | ||
93 | printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", | ||
94 | __func__, rv); | ||
95 | |||
96 | dev_remove(bdev->bm_mdev->bd_dev); | ||
97 | } | ||
98 | |||
99 | void bl_free_block_dev(struct pnfs_block_dev *bdev) | ||
100 | { | ||
101 | if (bdev) { | ||
102 | if (bdev->bm_mdev) { | ||
103 | dprintk("%s Removing DM device: %d:%d\n", | ||
104 | __func__, | ||
105 | MAJOR(bdev->bm_mdev->bd_dev), | ||
106 | MINOR(bdev->bm_mdev->bd_dev)); | ||
107 | nfs4_blk_metadev_release(bdev); | ||
108 | } | ||
109 | kfree(bdev); | ||
110 | } | ||
111 | } | ||
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 000000000000..19fa7b0b8c00 --- /dev/null +++ b/fs/nfs/blocklayout/extents.c | |||
@@ -0,0 +1,935 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/extents.c | ||
3 | * | ||
4 | * Module for the NFSv4.1 pNFS block layout driver. | ||
5 | * | ||
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Andy Adamson <andros@citi.umich.edu> | ||
10 | * Fred Isaman <iisaman@umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | |||
33 | #include "blocklayout.h" | ||
34 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
35 | |||
36 | /* Bit numbers */ | ||
37 | #define EXTENT_INITIALIZED 0 | ||
38 | #define EXTENT_WRITTEN 1 | ||
39 | #define EXTENT_IN_COMMIT 2 | ||
40 | #define INTERNAL_EXISTS MY_MAX_TAGS | ||
41 | #define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) | ||
42 | |||
43 | /* Returns largest t<=s s.t. t%base==0 */ | ||
44 | static inline sector_t normalize(sector_t s, int base) | ||
45 | { | ||
46 | sector_t tmp = s; /* Since do_div modifies its argument */ | ||
47 | return s - do_div(tmp, base); | ||
48 | } | ||
49 | |||
50 | static inline sector_t normalize_up(sector_t s, int base) | ||
51 | { | ||
52 | return normalize(s + base - 1, base); | ||
53 | } | ||
54 | |||
55 | /* Complete stub implementation using a list, until the wanted API is determined */ | ||
56 | |||
57 | /* Returns tags, or negative */ | ||
58 | static int32_t _find_entry(struct my_tree *tree, u64 s) | ||
59 | { | ||
60 | struct pnfs_inval_tracking *pos; | ||
61 | |||
62 | dprintk("%s(%llu) enter\n", __func__, s); | ||
63 | list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { | ||
64 | if (pos->it_sector > s) | ||
65 | continue; | ||
66 | else if (pos->it_sector == s) | ||
67 | return pos->it_tags & INTERNAL_MASK; | ||
68 | else | ||
69 | break; | ||
70 | } | ||
71 | return -ENOENT; | ||
72 | } | ||
73 | |||
74 | static inline | ||
75 | int _has_tag(struct my_tree *tree, u64 s, int32_t tag) | ||
76 | { | ||
77 | int32_t tags; | ||
78 | |||
79 | dprintk("%s(%llu, %i) enter\n", __func__, s, tag); | ||
80 | s = normalize(s, tree->mtt_step_size); | ||
81 | tags = _find_entry(tree, s); | ||
82 | if ((tags < 0) || !(tags & (1 << tag))) | ||
83 | return 0; | ||
84 | else | ||
85 | return 1; | ||
86 | } | ||
87 | |||
88 | /* Creates entry with tag, or if entry already exists, unions tag to it. | ||
89 | * If storage is not NULL, newly created entry will use it. | ||
90 | * Returns number of entries added, or negative on error. | ||
91 | */ | ||
92 | static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, | ||
93 | struct pnfs_inval_tracking *storage) | ||
94 | { | ||
95 | int found = 0; | ||
96 | struct pnfs_inval_tracking *pos; | ||
97 | |||
98 | dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); | ||
99 | list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { | ||
100 | if (pos->it_sector > s) | ||
101 | continue; | ||
102 | else if (pos->it_sector == s) { | ||
103 | found = 1; | ||
104 | break; | ||
105 | } else | ||
106 | break; | ||
107 | } | ||
108 | if (found) { | ||
109 | pos->it_tags |= (1 << tag); | ||
110 | return 0; | ||
111 | } else { | ||
112 | struct pnfs_inval_tracking *new; | ||
113 | if (storage) | ||
114 | new = storage; | ||
115 | else { | ||
116 | new = kmalloc(sizeof(*new), GFP_NOFS); | ||
117 | if (!new) | ||
118 | return -ENOMEM; | ||
119 | } | ||
120 | new->it_sector = s; | ||
121 | new->it_tags = (1 << tag); | ||
122 | list_add(&new->it_link, &pos->it_link); | ||
123 | return 1; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | /* XXXX Really want option to not create */ | ||
128 | /* Over range, unions tag with existing entries, else creates entry with tag */ | ||
129 | static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) | ||
130 | { | ||
131 | u64 i; | ||
132 | |||
133 | dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); | ||
134 | for (i = normalize(s, tree->mtt_step_size); i < s + length; | ||
135 | i += tree->mtt_step_size) | ||
136 | if (_add_entry(tree, i, tag, NULL)) | ||
137 | return -ENOMEM; | ||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | /* Ensure that future operations on given range of tree will not malloc */ | ||
142 | static int _preload_range(struct my_tree *tree, u64 offset, u64 length) | ||
143 | { | ||
144 | u64 start, end, s; | ||
145 | int count, i, used = 0, status = -ENOMEM; | ||
146 | struct pnfs_inval_tracking **storage; | ||
147 | |||
148 | dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); | ||
149 | start = normalize(offset, tree->mtt_step_size); | ||
150 | end = normalize_up(offset + length, tree->mtt_step_size); | ||
151 | count = (int)(end - start) / (int)tree->mtt_step_size; | ||
152 | |||
153 | /* Pre-malloc what memory we might need */ | ||
154 | storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); | ||
155 | if (!storage) | ||
156 | return -ENOMEM; | ||
157 | for (i = 0; i < count; i++) { | ||
158 | storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), | ||
159 | GFP_NOFS); | ||
160 | if (!storage[i]) | ||
161 | goto out_cleanup; | ||
162 | } | ||
163 | |||
164 | /* Now need lock - HOW??? */ | ||
165 | |||
166 | for (s = start; s < end; s += tree->mtt_step_size) | ||
167 | used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); | ||
168 | |||
169 | /* Unlock - HOW??? */ | ||
170 | status = 0; | ||
171 | |||
172 | out_cleanup: | ||
173 | for (i = used; i < count; i++) { | ||
174 | if (!storage[i]) | ||
175 | break; | ||
176 | kfree(storage[i]); | ||
177 | } | ||
178 | kfree(storage); | ||
179 | return status; | ||
180 | } | ||
181 | |||
182 | static void set_needs_init(sector_t *array, sector_t offset) | ||
183 | { | ||
184 | sector_t *p = array; | ||
185 | |||
186 | dprintk("%s enter\n", __func__); | ||
187 | if (!p) | ||
188 | return; | ||
189 | while (*p < offset) | ||
190 | p++; | ||
191 | if (*p == offset) | ||
192 | return; | ||
193 | else if (*p == ~0) { | ||
194 | *p++ = offset; | ||
195 | *p = ~0; | ||
196 | return; | ||
197 | } else { | ||
198 | sector_t *save = p; | ||
199 | dprintk("%s Adding %llu\n", __func__, (u64)offset); | ||
200 | while (*p != ~0) | ||
201 | p++; | ||
202 | p++; | ||
203 | memmove(save + 1, save, (char *)p - (char *)save); | ||
204 | *save = offset; | ||
205 | return; | ||
206 | } | ||
207 | } | ||
208 | |||
209 | /* We are relying on page lock to serialize this */ | ||
210 | int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) | ||
211 | { | ||
212 | int rv; | ||
213 | |||
214 | spin_lock(&marks->im_lock); | ||
215 | rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); | ||
216 | spin_unlock(&marks->im_lock); | ||
217 | return rv; | ||
218 | } | ||
219 | |||
220 | /* Assume start, end already sector aligned */ | ||
221 | static int | ||
222 | _range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) | ||
223 | { | ||
224 | struct pnfs_inval_tracking *pos; | ||
225 | u64 expect = 0; | ||
226 | |||
227 | dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); | ||
228 | list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { | ||
229 | if (pos->it_sector >= end) | ||
230 | continue; | ||
231 | if (!expect) { | ||
232 | if ((pos->it_sector == end - tree->mtt_step_size) && | ||
233 | (pos->it_tags & (1 << tag))) { | ||
234 | expect = pos->it_sector - tree->mtt_step_size; | ||
235 | if (pos->it_sector < tree->mtt_step_size || expect < start) | ||
236 | return 1; | ||
237 | continue; | ||
238 | } else { | ||
239 | return 0; | ||
240 | } | ||
241 | } | ||
242 | if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) | ||
243 | return 0; | ||
244 | expect -= tree->mtt_step_size; | ||
245 | if (expect < start) | ||
246 | return 1; | ||
247 | } | ||
248 | return 0; | ||
249 | } | ||
250 | |||
251 | static int is_range_written(struct pnfs_inval_markings *marks, | ||
252 | sector_t start, sector_t end) | ||
253 | { | ||
254 | int rv; | ||
255 | |||
256 | spin_lock(&marks->im_lock); | ||
257 | rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); | ||
258 | spin_unlock(&marks->im_lock); | ||
259 | return rv; | ||
260 | } | ||
261 | |||
262 | /* Marks sectors in [offest, offset_length) as having been initialized. | ||
263 | * All lengths are step-aligned, where step is min(pagesize, blocksize). | ||
264 | * Notes where partial block is initialized, and helps prepare it for | ||
265 | * complete initialization later. | ||
266 | */ | ||
267 | /* Currently assumes offset is page-aligned */ | ||
268 | int bl_mark_sectors_init(struct pnfs_inval_markings *marks, | ||
269 | sector_t offset, sector_t length, | ||
270 | sector_t **pages) | ||
271 | { | ||
272 | sector_t s, start, end; | ||
273 | sector_t *array = NULL; /* Pages to mark */ | ||
274 | |||
275 | dprintk("%s(offset=%llu,len=%llu) enter\n", | ||
276 | __func__, (u64)offset, (u64)length); | ||
277 | s = max((sector_t) 3, | ||
278 | 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); | ||
279 | dprintk("%s set max=%llu\n", __func__, (u64)s); | ||
280 | if (pages) { | ||
281 | array = kmalloc(s * sizeof(sector_t), GFP_NOFS); | ||
282 | if (!array) | ||
283 | goto outerr; | ||
284 | array[0] = ~0; | ||
285 | } | ||
286 | |||
287 | start = normalize(offset, marks->im_block_size); | ||
288 | end = normalize_up(offset + length, marks->im_block_size); | ||
289 | if (_preload_range(&marks->im_tree, start, end - start)) | ||
290 | goto outerr; | ||
291 | |||
292 | spin_lock(&marks->im_lock); | ||
293 | |||
294 | for (s = normalize_up(start, PAGE_CACHE_SECTORS); | ||
295 | s < offset; s += PAGE_CACHE_SECTORS) { | ||
296 | dprintk("%s pre-area pages\n", __func__); | ||
297 | /* Portion of used block is not initialized */ | ||
298 | if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) | ||
299 | set_needs_init(array, s); | ||
300 | } | ||
301 | if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) | ||
302 | goto out_unlock; | ||
303 | for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); | ||
304 | s < end; s += PAGE_CACHE_SECTORS) { | ||
305 | dprintk("%s post-area pages\n", __func__); | ||
306 | if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) | ||
307 | set_needs_init(array, s); | ||
308 | } | ||
309 | |||
310 | spin_unlock(&marks->im_lock); | ||
311 | |||
312 | if (pages) { | ||
313 | if (array[0] == ~0) { | ||
314 | kfree(array); | ||
315 | *pages = NULL; | ||
316 | } else | ||
317 | *pages = array; | ||
318 | } | ||
319 | return 0; | ||
320 | |||
321 | out_unlock: | ||
322 | spin_unlock(&marks->im_lock); | ||
323 | outerr: | ||
324 | if (pages) { | ||
325 | kfree(array); | ||
326 | *pages = NULL; | ||
327 | } | ||
328 | return -ENOMEM; | ||
329 | } | ||
330 | |||
331 | /* Marks sectors in [offest, offset+length) as having been written to disk. | ||
332 | * All lengths should be block aligned. | ||
333 | */ | ||
334 | static int mark_written_sectors(struct pnfs_inval_markings *marks, | ||
335 | sector_t offset, sector_t length) | ||
336 | { | ||
337 | int status; | ||
338 | |||
339 | dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, | ||
340 | (u64)offset, (u64)length); | ||
341 | spin_lock(&marks->im_lock); | ||
342 | status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); | ||
343 | spin_unlock(&marks->im_lock); | ||
344 | return status; | ||
345 | } | ||
346 | |||
347 | static void print_short_extent(struct pnfs_block_short_extent *be) | ||
348 | { | ||
349 | dprintk("PRINT SHORT EXTENT extent %p\n", be); | ||
350 | if (be) { | ||
351 | dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); | ||
352 | dprintk(" be_length %llu\n", (u64)be->bse_length); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | static void print_clist(struct list_head *list, unsigned int count) | ||
357 | { | ||
358 | struct pnfs_block_short_extent *be; | ||
359 | unsigned int i = 0; | ||
360 | |||
361 | ifdebug(FACILITY) { | ||
362 | printk(KERN_DEBUG "****************\n"); | ||
363 | printk(KERN_DEBUG "Extent list looks like:\n"); | ||
364 | list_for_each_entry(be, list, bse_node) { | ||
365 | i++; | ||
366 | print_short_extent(be); | ||
367 | } | ||
368 | if (i != count) | ||
369 | printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); | ||
370 | printk(KERN_DEBUG "****************\n"); | ||
371 | } | ||
372 | } | ||
373 | |||
374 | /* Note: In theory, we should do more checking that devid's match between | ||
375 | * old and new, but if they don't, the lists are too corrupt to salvage anyway. | ||
376 | */ | ||
377 | /* Note this is very similar to bl_add_merge_extent */ | ||
378 | static void add_to_commitlist(struct pnfs_block_layout *bl, | ||
379 | struct pnfs_block_short_extent *new) | ||
380 | { | ||
381 | struct list_head *clist = &bl->bl_commit; | ||
382 | struct pnfs_block_short_extent *old, *save; | ||
383 | sector_t end = new->bse_f_offset + new->bse_length; | ||
384 | |||
385 | dprintk("%s enter\n", __func__); | ||
386 | print_short_extent(new); | ||
387 | print_clist(clist, bl->bl_count); | ||
388 | bl->bl_count++; | ||
389 | /* Scan for proper place to insert, extending new to the left | ||
390 | * as much as possible. | ||
391 | */ | ||
392 | list_for_each_entry_safe(old, save, clist, bse_node) { | ||
393 | if (new->bse_f_offset < old->bse_f_offset) | ||
394 | break; | ||
395 | if (end <= old->bse_f_offset + old->bse_length) { | ||
396 | /* Range is already in list */ | ||
397 | bl->bl_count--; | ||
398 | kfree(new); | ||
399 | return; | ||
400 | } else if (new->bse_f_offset <= | ||
401 | old->bse_f_offset + old->bse_length) { | ||
402 | /* new overlaps or abuts existing be */ | ||
403 | if (new->bse_mdev == old->bse_mdev) { | ||
404 | /* extend new to fully replace old */ | ||
405 | new->bse_length += new->bse_f_offset - | ||
406 | old->bse_f_offset; | ||
407 | new->bse_f_offset = old->bse_f_offset; | ||
408 | list_del(&old->bse_node); | ||
409 | bl->bl_count--; | ||
410 | kfree(old); | ||
411 | } | ||
412 | } | ||
413 | } | ||
414 | /* Note that if we never hit the above break, old will not point to a | ||
415 | * valid extent. However, in that case &old->bse_node==list. | ||
416 | */ | ||
417 | list_add_tail(&new->bse_node, &old->bse_node); | ||
418 | /* Scan forward for overlaps. If we find any, extend new and | ||
419 | * remove the overlapped extent. | ||
420 | */ | ||
421 | old = list_prepare_entry(new, clist, bse_node); | ||
422 | list_for_each_entry_safe_continue(old, save, clist, bse_node) { | ||
423 | if (end < old->bse_f_offset) | ||
424 | break; | ||
425 | /* new overlaps or abuts old */ | ||
426 | if (new->bse_mdev == old->bse_mdev) { | ||
427 | if (end < old->bse_f_offset + old->bse_length) { | ||
428 | /* extend new to fully cover old */ | ||
429 | end = old->bse_f_offset + old->bse_length; | ||
430 | new->bse_length = end - new->bse_f_offset; | ||
431 | } | ||
432 | list_del(&old->bse_node); | ||
433 | bl->bl_count--; | ||
434 | kfree(old); | ||
435 | } | ||
436 | } | ||
437 | dprintk("%s: after merging\n", __func__); | ||
438 | print_clist(clist, bl->bl_count); | ||
439 | } | ||
440 | |||
441 | /* Note the range described by offset, length is guaranteed to be contained | ||
442 | * within be. | ||
443 | */ | ||
444 | int bl_mark_for_commit(struct pnfs_block_extent *be, | ||
445 | sector_t offset, sector_t length) | ||
446 | { | ||
447 | sector_t new_end, end = offset + length; | ||
448 | struct pnfs_block_short_extent *new; | ||
449 | struct pnfs_block_layout *bl = container_of(be->be_inval, | ||
450 | struct pnfs_block_layout, | ||
451 | bl_inval); | ||
452 | |||
453 | new = kmalloc(sizeof(*new), GFP_NOFS); | ||
454 | if (!new) | ||
455 | return -ENOMEM; | ||
456 | |||
457 | mark_written_sectors(be->be_inval, offset, length); | ||
458 | /* We want to add the range to commit list, but it must be | ||
459 | * block-normalized, and verified that the normalized range has | ||
460 | * been entirely written to disk. | ||
461 | */ | ||
462 | new->bse_f_offset = offset; | ||
463 | offset = normalize(offset, bl->bl_blocksize); | ||
464 | if (offset < new->bse_f_offset) { | ||
465 | if (is_range_written(be->be_inval, offset, new->bse_f_offset)) | ||
466 | new->bse_f_offset = offset; | ||
467 | else | ||
468 | new->bse_f_offset = offset + bl->bl_blocksize; | ||
469 | } | ||
470 | new_end = normalize_up(end, bl->bl_blocksize); | ||
471 | if (end < new_end) { | ||
472 | if (is_range_written(be->be_inval, end, new_end)) | ||
473 | end = new_end; | ||
474 | else | ||
475 | end = new_end - bl->bl_blocksize; | ||
476 | } | ||
477 | if (end <= new->bse_f_offset) { | ||
478 | kfree(new); | ||
479 | return 0; | ||
480 | } | ||
481 | new->bse_length = end - new->bse_f_offset; | ||
482 | new->bse_devid = be->be_devid; | ||
483 | new->bse_mdev = be->be_mdev; | ||
484 | |||
485 | spin_lock(&bl->bl_ext_lock); | ||
486 | /* new will be freed, either by add_to_commitlist if it decides not | ||
487 | * to use it, or after LAYOUTCOMMIT uses it in the commitlist. | ||
488 | */ | ||
489 | add_to_commitlist(bl, new); | ||
490 | spin_unlock(&bl->bl_ext_lock); | ||
491 | return 0; | ||
492 | } | ||
493 | |||
494 | static void print_bl_extent(struct pnfs_block_extent *be) | ||
495 | { | ||
496 | dprintk("PRINT EXTENT extent %p\n", be); | ||
497 | if (be) { | ||
498 | dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); | ||
499 | dprintk(" be_length %llu\n", (u64)be->be_length); | ||
500 | dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); | ||
501 | dprintk(" be_state %d\n", be->be_state); | ||
502 | } | ||
503 | } | ||
504 | |||
505 | static void | ||
506 | destroy_extent(struct kref *kref) | ||
507 | { | ||
508 | struct pnfs_block_extent *be; | ||
509 | |||
510 | be = container_of(kref, struct pnfs_block_extent, be_refcnt); | ||
511 | dprintk("%s be=%p\n", __func__, be); | ||
512 | kfree(be); | ||
513 | } | ||
514 | |||
515 | void | ||
516 | bl_put_extent(struct pnfs_block_extent *be) | ||
517 | { | ||
518 | if (be) { | ||
519 | dprintk("%s enter %p (%i)\n", __func__, be, | ||
520 | atomic_read(&be->be_refcnt.refcount)); | ||
521 | kref_put(&be->be_refcnt, destroy_extent); | ||
522 | } | ||
523 | } | ||
524 | |||
525 | struct pnfs_block_extent *bl_alloc_extent(void) | ||
526 | { | ||
527 | struct pnfs_block_extent *be; | ||
528 | |||
529 | be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); | ||
530 | if (!be) | ||
531 | return NULL; | ||
532 | INIT_LIST_HEAD(&be->be_node); | ||
533 | kref_init(&be->be_refcnt); | ||
534 | be->be_inval = NULL; | ||
535 | return be; | ||
536 | } | ||
537 | |||
538 | static void print_elist(struct list_head *list) | ||
539 | { | ||
540 | struct pnfs_block_extent *be; | ||
541 | dprintk("****************\n"); | ||
542 | dprintk("Extent list looks like:\n"); | ||
543 | list_for_each_entry(be, list, be_node) { | ||
544 | print_bl_extent(be); | ||
545 | } | ||
546 | dprintk("****************\n"); | ||
547 | } | ||
548 | |||
549 | static inline int | ||
550 | extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) | ||
551 | { | ||
552 | /* Note this assumes new->be_f_offset >= old->be_f_offset */ | ||
553 | return (new->be_state == old->be_state) && | ||
554 | ((new->be_state == PNFS_BLOCK_NONE_DATA) || | ||
555 | ((new->be_v_offset - old->be_v_offset == | ||
556 | new->be_f_offset - old->be_f_offset) && | ||
557 | new->be_mdev == old->be_mdev)); | ||
558 | } | ||
559 | |||
560 | /* Adds new to appropriate list in bl, modifying new and removing existing | ||
561 | * extents as appropriate to deal with overlaps. | ||
562 | * | ||
563 | * See bl_find_get_extent for list constraints. | ||
564 | * | ||
565 | * Refcount on new is already set. If end up not using it, or error out, | ||
566 | * need to put the reference. | ||
567 | * | ||
568 | * bl->bl_ext_lock is held by caller. | ||
569 | */ | ||
570 | int | ||
571 | bl_add_merge_extent(struct pnfs_block_layout *bl, | ||
572 | struct pnfs_block_extent *new) | ||
573 | { | ||
574 | struct pnfs_block_extent *be, *tmp; | ||
575 | sector_t end = new->be_f_offset + new->be_length; | ||
576 | struct list_head *list; | ||
577 | |||
578 | dprintk("%s enter with be=%p\n", __func__, new); | ||
579 | print_bl_extent(new); | ||
580 | list = &bl->bl_extents[bl_choose_list(new->be_state)]; | ||
581 | print_elist(list); | ||
582 | |||
583 | /* Scan for proper place to insert, extending new to the left | ||
584 | * as much as possible. | ||
585 | */ | ||
586 | list_for_each_entry_safe_reverse(be, tmp, list, be_node) { | ||
587 | if (new->be_f_offset >= be->be_f_offset + be->be_length) | ||
588 | break; | ||
589 | if (new->be_f_offset >= be->be_f_offset) { | ||
590 | if (end <= be->be_f_offset + be->be_length) { | ||
591 | /* new is a subset of existing be*/ | ||
592 | if (extents_consistent(be, new)) { | ||
593 | dprintk("%s: new is subset, ignoring\n", | ||
594 | __func__); | ||
595 | bl_put_extent(new); | ||
596 | return 0; | ||
597 | } else { | ||
598 | goto out_err; | ||
599 | } | ||
600 | } else { | ||
601 | /* |<-- be -->| | ||
602 | * |<-- new -->| */ | ||
603 | if (extents_consistent(be, new)) { | ||
604 | /* extend new to fully replace be */ | ||
605 | new->be_length += new->be_f_offset - | ||
606 | be->be_f_offset; | ||
607 | new->be_f_offset = be->be_f_offset; | ||
608 | new->be_v_offset = be->be_v_offset; | ||
609 | dprintk("%s: removing %p\n", __func__, be); | ||
610 | list_del(&be->be_node); | ||
611 | bl_put_extent(be); | ||
612 | } else { | ||
613 | goto out_err; | ||
614 | } | ||
615 | } | ||
616 | } else if (end >= be->be_f_offset + be->be_length) { | ||
617 | /* new extent overlap existing be */ | ||
618 | if (extents_consistent(be, new)) { | ||
619 | /* extend new to fully replace be */ | ||
620 | dprintk("%s: removing %p\n", __func__, be); | ||
621 | list_del(&be->be_node); | ||
622 | bl_put_extent(be); | ||
623 | } else { | ||
624 | goto out_err; | ||
625 | } | ||
626 | } else if (end > be->be_f_offset) { | ||
627 | /* |<-- be -->| | ||
628 | *|<-- new -->| */ | ||
629 | if (extents_consistent(new, be)) { | ||
630 | /* extend new to fully replace be */ | ||
631 | new->be_length += be->be_f_offset + be->be_length - | ||
632 | new->be_f_offset - new->be_length; | ||
633 | dprintk("%s: removing %p\n", __func__, be); | ||
634 | list_del(&be->be_node); | ||
635 | bl_put_extent(be); | ||
636 | } else { | ||
637 | goto out_err; | ||
638 | } | ||
639 | } | ||
640 | } | ||
641 | /* Note that if we never hit the above break, be will not point to a | ||
642 | * valid extent. However, in that case &be->be_node==list. | ||
643 | */ | ||
644 | list_add(&new->be_node, &be->be_node); | ||
645 | dprintk("%s: inserting new\n", __func__); | ||
646 | print_elist(list); | ||
647 | /* FIXME - The per-list consistency checks have all been done, | ||
648 | * should now check cross-list consistency. | ||
649 | */ | ||
650 | return 0; | ||
651 | |||
652 | out_err: | ||
653 | bl_put_extent(new); | ||
654 | return -EIO; | ||
655 | } | ||
656 | |||
657 | /* Returns extent, or NULL. If a second READ extent exists, it is returned | ||
658 | * in cow_read, if given. | ||
659 | * | ||
660 | * The extents are kept in two seperate ordered lists, one for READ and NONE, | ||
661 | * one for READWRITE and INVALID. Within each list, we assume: | ||
662 | * 1. Extents are ordered by file offset. | ||
663 | * 2. For any given isect, there is at most one extents that matches. | ||
664 | */ | ||
665 | struct pnfs_block_extent * | ||
666 | bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, | ||
667 | struct pnfs_block_extent **cow_read) | ||
668 | { | ||
669 | struct pnfs_block_extent *be, *cow, *ret; | ||
670 | int i; | ||
671 | |||
672 | dprintk("%s enter with isect %llu\n", __func__, (u64)isect); | ||
673 | cow = ret = NULL; | ||
674 | spin_lock(&bl->bl_ext_lock); | ||
675 | for (i = 0; i < EXTENT_LISTS; i++) { | ||
676 | list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { | ||
677 | if (isect >= be->be_f_offset + be->be_length) | ||
678 | break; | ||
679 | if (isect >= be->be_f_offset) { | ||
680 | /* We have found an extent */ | ||
681 | dprintk("%s Get %p (%i)\n", __func__, be, | ||
682 | atomic_read(&be->be_refcnt.refcount)); | ||
683 | kref_get(&be->be_refcnt); | ||
684 | if (!ret) | ||
685 | ret = be; | ||
686 | else if (be->be_state != PNFS_BLOCK_READ_DATA) | ||
687 | bl_put_extent(be); | ||
688 | else | ||
689 | cow = be; | ||
690 | break; | ||
691 | } | ||
692 | } | ||
693 | if (ret && | ||
694 | (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) | ||
695 | break; | ||
696 | } | ||
697 | spin_unlock(&bl->bl_ext_lock); | ||
698 | if (cow_read) | ||
699 | *cow_read = cow; | ||
700 | print_bl_extent(ret); | ||
701 | return ret; | ||
702 | } | ||
703 | |||
704 | /* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ | ||
705 | static struct pnfs_block_extent * | ||
706 | bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) | ||
707 | { | ||
708 | struct pnfs_block_extent *be, *ret = NULL; | ||
709 | int i; | ||
710 | |||
711 | dprintk("%s enter with isect %llu\n", __func__, (u64)isect); | ||
712 | for (i = 0; i < EXTENT_LISTS; i++) { | ||
713 | if (ret) | ||
714 | break; | ||
715 | list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { | ||
716 | if (isect >= be->be_f_offset + be->be_length) | ||
717 | break; | ||
718 | if (isect >= be->be_f_offset) { | ||
719 | /* We have found an extent */ | ||
720 | dprintk("%s Get %p (%i)\n", __func__, be, | ||
721 | atomic_read(&be->be_refcnt.refcount)); | ||
722 | kref_get(&be->be_refcnt); | ||
723 | ret = be; | ||
724 | break; | ||
725 | } | ||
726 | } | ||
727 | } | ||
728 | print_bl_extent(ret); | ||
729 | return ret; | ||
730 | } | ||
731 | |||
732 | int | ||
733 | encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, | ||
734 | struct xdr_stream *xdr, | ||
735 | const struct nfs4_layoutcommit_args *arg) | ||
736 | { | ||
737 | struct pnfs_block_short_extent *lce, *save; | ||
738 | unsigned int count = 0; | ||
739 | __be32 *p, *xdr_start; | ||
740 | |||
741 | dprintk("%s enter\n", __func__); | ||
742 | /* BUG - creation of bl_commit is buggy - need to wait for | ||
743 | * entire block to be marked WRITTEN before it can be added. | ||
744 | */ | ||
745 | spin_lock(&bl->bl_ext_lock); | ||
746 | /* Want to adjust for possible truncate */ | ||
747 | /* We now want to adjust argument range */ | ||
748 | |||
749 | /* XDR encode the ranges found */ | ||
750 | xdr_start = xdr_reserve_space(xdr, 8); | ||
751 | if (!xdr_start) | ||
752 | goto out; | ||
753 | list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { | ||
754 | p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); | ||
755 | if (!p) | ||
756 | break; | ||
757 | p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); | ||
758 | p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); | ||
759 | p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); | ||
760 | p = xdr_encode_hyper(p, 0LL); | ||
761 | *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); | ||
762 | list_del(&lce->bse_node); | ||
763 | list_add_tail(&lce->bse_node, &bl->bl_committing); | ||
764 | bl->bl_count--; | ||
765 | count++; | ||
766 | } | ||
767 | xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); | ||
768 | xdr_start[1] = cpu_to_be32(count); | ||
769 | out: | ||
770 | spin_unlock(&bl->bl_ext_lock); | ||
771 | dprintk("%s found %i ranges\n", __func__, count); | ||
772 | return 0; | ||
773 | } | ||
774 | |||
775 | /* Helper function to set_to_rw that initialize a new extent */ | ||
776 | static void | ||
777 | _prep_new_extent(struct pnfs_block_extent *new, | ||
778 | struct pnfs_block_extent *orig, | ||
779 | sector_t offset, sector_t length, int state) | ||
780 | { | ||
781 | kref_init(&new->be_refcnt); | ||
782 | /* don't need to INIT_LIST_HEAD(&new->be_node) */ | ||
783 | memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); | ||
784 | new->be_mdev = orig->be_mdev; | ||
785 | new->be_f_offset = offset; | ||
786 | new->be_length = length; | ||
787 | new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; | ||
788 | new->be_state = state; | ||
789 | new->be_inval = orig->be_inval; | ||
790 | } | ||
791 | |||
792 | /* Tries to merge be with extent in front of it in list. | ||
793 | * Frees storage if not used. | ||
794 | */ | ||
795 | static struct pnfs_block_extent * | ||
796 | _front_merge(struct pnfs_block_extent *be, struct list_head *head, | ||
797 | struct pnfs_block_extent *storage) | ||
798 | { | ||
799 | struct pnfs_block_extent *prev; | ||
800 | |||
801 | if (!storage) | ||
802 | goto no_merge; | ||
803 | if (&be->be_node == head || be->be_node.prev == head) | ||
804 | goto no_merge; | ||
805 | prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); | ||
806 | if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || | ||
807 | !extents_consistent(prev, be)) | ||
808 | goto no_merge; | ||
809 | _prep_new_extent(storage, prev, prev->be_f_offset, | ||
810 | prev->be_length + be->be_length, prev->be_state); | ||
811 | list_replace(&prev->be_node, &storage->be_node); | ||
812 | bl_put_extent(prev); | ||
813 | list_del(&be->be_node); | ||
814 | bl_put_extent(be); | ||
815 | return storage; | ||
816 | |||
817 | no_merge: | ||
818 | kfree(storage); | ||
819 | return be; | ||
820 | } | ||
821 | |||
822 | static u64 | ||
823 | set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) | ||
824 | { | ||
825 | u64 rv = offset + length; | ||
826 | struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; | ||
827 | struct pnfs_block_extent *children[3]; | ||
828 | struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; | ||
829 | int i = 0, j; | ||
830 | |||
831 | dprintk("%s(%llu, %llu)\n", __func__, offset, length); | ||
832 | /* Create storage for up to three new extents e1, e2, e3 */ | ||
833 | e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); | ||
834 | e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); | ||
835 | e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); | ||
836 | /* BUG - we are ignoring any failure */ | ||
837 | if (!e1 || !e2 || !e3) | ||
838 | goto out_nosplit; | ||
839 | |||
840 | spin_lock(&bl->bl_ext_lock); | ||
841 | be = bl_find_get_extent_locked(bl, offset); | ||
842 | rv = be->be_f_offset + be->be_length; | ||
843 | if (be->be_state != PNFS_BLOCK_INVALID_DATA) { | ||
844 | spin_unlock(&bl->bl_ext_lock); | ||
845 | goto out_nosplit; | ||
846 | } | ||
847 | /* Add e* to children, bumping e*'s krefs */ | ||
848 | if (be->be_f_offset != offset) { | ||
849 | _prep_new_extent(e1, be, be->be_f_offset, | ||
850 | offset - be->be_f_offset, | ||
851 | PNFS_BLOCK_INVALID_DATA); | ||
852 | children[i++] = e1; | ||
853 | print_bl_extent(e1); | ||
854 | } else | ||
855 | merge1 = e1; | ||
856 | _prep_new_extent(e2, be, offset, | ||
857 | min(length, be->be_f_offset + be->be_length - offset), | ||
858 | PNFS_BLOCK_READWRITE_DATA); | ||
859 | children[i++] = e2; | ||
860 | print_bl_extent(e2); | ||
861 | if (offset + length < be->be_f_offset + be->be_length) { | ||
862 | _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, | ||
863 | be->be_f_offset + be->be_length - | ||
864 | offset - length, | ||
865 | PNFS_BLOCK_INVALID_DATA); | ||
866 | children[i++] = e3; | ||
867 | print_bl_extent(e3); | ||
868 | } else | ||
869 | merge2 = e3; | ||
870 | |||
871 | /* Remove be from list, and insert the e* */ | ||
872 | /* We don't get refs on e*, since this list is the base reference | ||
873 | * set when init'ed. | ||
874 | */ | ||
875 | if (i < 3) | ||
876 | children[i] = NULL; | ||
877 | new = children[0]; | ||
878 | list_replace(&be->be_node, &new->be_node); | ||
879 | bl_put_extent(be); | ||
880 | new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); | ||
881 | for (j = 1; j < i; j++) { | ||
882 | old = new; | ||
883 | new = children[j]; | ||
884 | list_add(&new->be_node, &old->be_node); | ||
885 | } | ||
886 | if (merge2) { | ||
887 | /* This is a HACK, should just create a _back_merge function */ | ||
888 | new = list_entry(new->be_node.next, | ||
889 | struct pnfs_block_extent, be_node); | ||
890 | new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); | ||
891 | } | ||
892 | spin_unlock(&bl->bl_ext_lock); | ||
893 | |||
894 | /* Since we removed the base reference above, be is now scheduled for | ||
895 | * destruction. | ||
896 | */ | ||
897 | bl_put_extent(be); | ||
898 | dprintk("%s returns %llu after split\n", __func__, rv); | ||
899 | return rv; | ||
900 | |||
901 | out_nosplit: | ||
902 | kfree(e1); | ||
903 | kfree(e2); | ||
904 | kfree(e3); | ||
905 | dprintk("%s returns %llu without splitting\n", __func__, rv); | ||
906 | return rv; | ||
907 | } | ||
908 | |||
909 | void | ||
910 | clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, | ||
911 | const struct nfs4_layoutcommit_args *arg, | ||
912 | int status) | ||
913 | { | ||
914 | struct pnfs_block_short_extent *lce, *save; | ||
915 | |||
916 | dprintk("%s status %d\n", __func__, status); | ||
917 | list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { | ||
918 | if (likely(!status)) { | ||
919 | u64 offset = lce->bse_f_offset; | ||
920 | u64 end = offset + lce->bse_length; | ||
921 | |||
922 | do { | ||
923 | offset = set_to_rw(bl, offset, end - offset); | ||
924 | } while (offset < end); | ||
925 | list_del(&lce->bse_node); | ||
926 | |||
927 | kfree(lce); | ||
928 | } else { | ||
929 | list_del(&lce->bse_node); | ||
930 | spin_lock(&bl->bl_ext_lock); | ||
931 | add_to_commitlist(bl, lce); | ||
932 | spin_unlock(&bl->bl_ext_lock); | ||
933 | } | ||
934 | } | ||
935 | } | ||
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 19ea7d9c75e6..5833fbbf59b0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c | |||
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = { | |||
105 | .nrvers = ARRAY_SIZE(nfs_version), | 105 | .nrvers = ARRAY_SIZE(nfs_version), |
106 | .version = nfs_version, | 106 | .version = nfs_version, |
107 | .stats = &nfs_rpcstat, | 107 | .stats = &nfs_rpcstat, |
108 | .pipe_dir_name = "/nfs", | 108 | .pipe_dir_name = NFS_PIPE_DIRNAME, |
109 | }; | 109 | }; |
110 | 110 | ||
111 | struct rpc_stat nfs_rpcstat = { | 111 | struct rpc_stat nfs_rpcstat = { |
@@ -904,7 +904,9 @@ error: | |||
904 | /* | 904 | /* |
905 | * Load up the server record from information gained in an fsinfo record | 905 | * Load up the server record from information gained in an fsinfo record |
906 | */ | 906 | */ |
907 | static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) | 907 | static void nfs_server_set_fsinfo(struct nfs_server *server, |
908 | struct nfs_fh *mntfh, | ||
909 | struct nfs_fsinfo *fsinfo) | ||
908 | { | 910 | { |
909 | unsigned long max_rpc_payload; | 911 | unsigned long max_rpc_payload; |
910 | 912 | ||
@@ -934,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * | |||
934 | if (server->wsize > NFS_MAX_FILE_IO_SIZE) | 936 | if (server->wsize > NFS_MAX_FILE_IO_SIZE) |
935 | server->wsize = NFS_MAX_FILE_IO_SIZE; | 937 | server->wsize = NFS_MAX_FILE_IO_SIZE; |
936 | server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 938 | server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
937 | set_pnfs_layoutdriver(server, fsinfo->layouttype); | 939 | server->pnfs_blksize = fsinfo->blksize; |
940 | set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); | ||
938 | 941 | ||
939 | server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); | 942 | server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); |
940 | 943 | ||
@@ -980,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str | |||
980 | if (error < 0) | 983 | if (error < 0) |
981 | goto out_error; | 984 | goto out_error; |
982 | 985 | ||
983 | nfs_server_set_fsinfo(server, &fsinfo); | 986 | nfs_server_set_fsinfo(server, mntfh, &fsinfo); |
984 | 987 | ||
985 | /* Get some general file system info */ | 988 | /* Get some general file system info */ |
986 | if (server->namelen == 0) { | 989 | if (server->namelen == 0) { |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 57f578e2560a..b238d95ac48c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = { | |||
134 | 134 | ||
135 | #endif /* CONFIG_NFS_V4 */ | 135 | #endif /* CONFIG_NFS_V4 */ |
136 | 136 | ||
137 | static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) | 137 | static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) |
138 | { | 138 | { |
139 | struct nfs_open_dir_context *ctx; | 139 | struct nfs_open_dir_context *ctx; |
140 | ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); | 140 | ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); |
141 | if (ctx != NULL) { | 141 | if (ctx != NULL) { |
142 | ctx->duped = 0; | 142 | ctx->duped = 0; |
143 | ctx->attr_gencount = NFS_I(dir)->attr_gencount; | ||
143 | ctx->dir_cookie = 0; | 144 | ctx->dir_cookie = 0; |
144 | ctx->dup_cookie = 0; | 145 | ctx->dup_cookie = 0; |
145 | ctx->cred = get_rpccred(cred); | 146 | ctx->cred = get_rpccred(cred); |
146 | } else | 147 | return ctx; |
147 | ctx = ERR_PTR(-ENOMEM); | 148 | } |
148 | return ctx; | 149 | return ERR_PTR(-ENOMEM); |
149 | } | 150 | } |
150 | 151 | ||
151 | static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) | 152 | static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) |
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp) | |||
173 | cred = rpc_lookup_cred(); | 174 | cred = rpc_lookup_cred(); |
174 | if (IS_ERR(cred)) | 175 | if (IS_ERR(cred)) |
175 | return PTR_ERR(cred); | 176 | return PTR_ERR(cred); |
176 | ctx = alloc_nfs_open_dir_context(cred); | 177 | ctx = alloc_nfs_open_dir_context(inode, cred); |
177 | if (IS_ERR(ctx)) { | 178 | if (IS_ERR(ctx)) { |
178 | res = PTR_ERR(ctx); | 179 | res = PTR_ERR(ctx); |
179 | goto out; | 180 | goto out; |
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri | |||
323 | { | 324 | { |
324 | loff_t diff = desc->file->f_pos - desc->current_index; | 325 | loff_t diff = desc->file->f_pos - desc->current_index; |
325 | unsigned int index; | 326 | unsigned int index; |
326 | struct nfs_open_dir_context *ctx = desc->file->private_data; | ||
327 | 327 | ||
328 | if (diff < 0) | 328 | if (diff < 0) |
329 | goto out_eof; | 329 | goto out_eof; |
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri | |||
336 | index = (unsigned int)diff; | 336 | index = (unsigned int)diff; |
337 | *desc->dir_cookie = array->array[index].cookie; | 337 | *desc->dir_cookie = array->array[index].cookie; |
338 | desc->cache_entry_index = index; | 338 | desc->cache_entry_index = index; |
339 | ctx->duped = 0; | ||
340 | return 0; | 339 | return 0; |
341 | out_eof: | 340 | out_eof: |
342 | desc->eof = 1; | 341 | desc->eof = 1; |
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des | |||
349 | int i; | 348 | int i; |
350 | loff_t new_pos; | 349 | loff_t new_pos; |
351 | int status = -EAGAIN; | 350 | int status = -EAGAIN; |
352 | struct nfs_open_dir_context *ctx = desc->file->private_data; | ||
353 | 351 | ||
354 | for (i = 0; i < array->size; i++) { | 352 | for (i = 0; i < array->size; i++) { |
355 | if (array->array[i].cookie == *desc->dir_cookie) { | 353 | if (array->array[i].cookie == *desc->dir_cookie) { |
354 | struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); | ||
355 | struct nfs_open_dir_context *ctx = desc->file->private_data; | ||
356 | |||
356 | new_pos = desc->current_index + i; | 357 | new_pos = desc->current_index + i; |
357 | if (new_pos < desc->file->f_pos) { | 358 | if (ctx->attr_gencount != nfsi->attr_gencount |
359 | || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { | ||
360 | ctx->duped = 0; | ||
361 | ctx->attr_gencount = nfsi->attr_gencount; | ||
362 | } else if (new_pos < desc->file->f_pos) { | ||
363 | if (ctx->duped > 0 | ||
364 | && ctx->dup_cookie == *desc->dir_cookie) { | ||
365 | if (printk_ratelimit()) { | ||
366 | pr_notice("NFS: directory %s/%s contains a readdir loop." | ||
367 | "Please contact your server vendor. " | ||
368 | "The file: %s has duplicate cookie %llu\n", | ||
369 | desc->file->f_dentry->d_parent->d_name.name, | ||
370 | desc->file->f_dentry->d_name.name, | ||
371 | array->array[i].string.name, | ||
372 | *desc->dir_cookie); | ||
373 | } | ||
374 | status = -ELOOP; | ||
375 | goto out; | ||
376 | } | ||
358 | ctx->dup_cookie = *desc->dir_cookie; | 377 | ctx->dup_cookie = *desc->dir_cookie; |
359 | ctx->duped = 1; | 378 | ctx->duped = -1; |
360 | } | 379 | } |
361 | desc->file->f_pos = new_pos; | 380 | desc->file->f_pos = new_pos; |
362 | desc->cache_entry_index = i; | 381 | desc->cache_entry_index = i; |
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des | |||
368 | if (*desc->dir_cookie == array->last_cookie) | 387 | if (*desc->dir_cookie == array->last_cookie) |
369 | desc->eof = 1; | 388 | desc->eof = 1; |
370 | } | 389 | } |
390 | out: | ||
371 | return status; | 391 | return status; |
372 | } | 392 | } |
373 | 393 | ||
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
740 | struct nfs_cache_array *array = NULL; | 760 | struct nfs_cache_array *array = NULL; |
741 | struct nfs_open_dir_context *ctx = file->private_data; | 761 | struct nfs_open_dir_context *ctx = file->private_data; |
742 | 762 | ||
743 | if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { | ||
744 | if (printk_ratelimit()) { | ||
745 | pr_notice("NFS: directory %s/%s contains a readdir loop. " | ||
746 | "Please contact your server vendor. " | ||
747 | "Offending cookie: %llu\n", | ||
748 | file->f_dentry->d_parent->d_name.name, | ||
749 | file->f_dentry->d_name.name, | ||
750 | *desc->dir_cookie); | ||
751 | } | ||
752 | res = -ELOOP; | ||
753 | goto out; | ||
754 | } | ||
755 | |||
756 | array = nfs_readdir_get_array(desc->page); | 763 | array = nfs_readdir_get_array(desc->page); |
757 | if (IS_ERR(array)) { | 764 | if (IS_ERR(array)) { |
758 | res = PTR_ERR(array); | 765 | res = PTR_ERR(array); |
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
774 | *desc->dir_cookie = array->array[i+1].cookie; | 781 | *desc->dir_cookie = array->array[i+1].cookie; |
775 | else | 782 | else |
776 | *desc->dir_cookie = array->last_cookie; | 783 | *desc->dir_cookie = array->last_cookie; |
784 | if (ctx->duped != 0) | ||
785 | ctx->duped = 1; | ||
777 | } | 786 | } |
778 | if (array->eof_index >= 0) | 787 | if (array->eof_index >= 0) |
779 | desc->eof = 1; | 788 | desc->eof = 1; |
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
805 | struct page *page = NULL; | 814 | struct page *page = NULL; |
806 | int status; | 815 | int status; |
807 | struct inode *inode = desc->file->f_path.dentry->d_inode; | 816 | struct inode *inode = desc->file->f_path.dentry->d_inode; |
817 | struct nfs_open_dir_context *ctx = desc->file->private_data; | ||
808 | 818 | ||
809 | dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", | 819 | dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", |
810 | (unsigned long long)*desc->dir_cookie); | 820 | (unsigned long long)*desc->dir_cookie); |
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, | |||
818 | desc->page_index = 0; | 828 | desc->page_index = 0; |
819 | desc->last_cookie = *desc->dir_cookie; | 829 | desc->last_cookie = *desc->dir_cookie; |
820 | desc->page = page; | 830 | desc->page = page; |
831 | ctx->duped = 0; | ||
821 | 832 | ||
822 | status = nfs_readdir_xdr_to_array(desc, page, inode); | 833 | status = nfs_readdir_xdr_to_array(desc, page, inode); |
823 | if (status < 0) | 834 | if (status < 0) |
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1909ee8be350..1ec1a85fa71c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h | |||
@@ -318,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; | |||
318 | extern const u32 nfs4_fattr_bitmap[2]; | 318 | extern const u32 nfs4_fattr_bitmap[2]; |
319 | extern const u32 nfs4_statfs_bitmap[2]; | 319 | extern const u32 nfs4_statfs_bitmap[2]; |
320 | extern const u32 nfs4_pathconf_bitmap[2]; | 320 | extern const u32 nfs4_pathconf_bitmap[2]; |
321 | extern const u32 nfs4_fsinfo_bitmap[2]; | 321 | extern const u32 nfs4_fsinfo_bitmap[3]; |
322 | extern const u32 nfs4_fs_locations_bitmap[2]; | 322 | extern const u32 nfs4_fs_locations_bitmap[2]; |
323 | 323 | ||
324 | /* nfs4renewd.c */ | 324 | /* nfs4renewd.c */ |
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index be93a622872c..e8915d4840ad 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c | |||
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) | |||
170 | 170 | ||
171 | pnfs_set_layoutcommit(wdata); | 171 | pnfs_set_layoutcommit(wdata); |
172 | dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, | 172 | dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, |
173 | (unsigned long) wdata->lseg->pls_end_pos); | 173 | (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); |
174 | } | 174 | } |
175 | 175 | ||
176 | /* | 176 | /* |
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 079614deca3f..8c77039e7a81 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
@@ -140,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = { | |||
140 | 0 | 140 | 0 |
141 | }; | 141 | }; |
142 | 142 | ||
143 | const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE | 143 | const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE |
144 | | FATTR4_WORD0_MAXREAD | 144 | | FATTR4_WORD0_MAXREAD |
145 | | FATTR4_WORD0_MAXWRITE | 145 | | FATTR4_WORD0_MAXWRITE |
146 | | FATTR4_WORD0_LEASE_TIME, | 146 | | FATTR4_WORD0_LEASE_TIME, |
147 | FATTR4_WORD1_TIME_DELTA | 147 | FATTR4_WORD1_TIME_DELTA |
148 | | FATTR4_WORD1_FS_LAYOUT_TYPES | 148 | | FATTR4_WORD1_FS_LAYOUT_TYPES, |
149 | FATTR4_WORD2_LAYOUT_BLKSIZE | ||
149 | }; | 150 | }; |
150 | 151 | ||
151 | const u32 nfs4_fs_locations_bitmap[2] = { | 152 | const u32 nfs4_fs_locations_bitmap[2] = { |
@@ -5834,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) | |||
5834 | return status; | 5835 | return status; |
5835 | } | 5836 | } |
5836 | 5837 | ||
5838 | /* | ||
5839 | * Retrieve the list of Data Server devices from the MDS. | ||
5840 | */ | ||
5841 | static int _nfs4_getdevicelist(struct nfs_server *server, | ||
5842 | const struct nfs_fh *fh, | ||
5843 | struct pnfs_devicelist *devlist) | ||
5844 | { | ||
5845 | struct nfs4_getdevicelist_args args = { | ||
5846 | .fh = fh, | ||
5847 | .layoutclass = server->pnfs_curr_ld->id, | ||
5848 | }; | ||
5849 | struct nfs4_getdevicelist_res res = { | ||
5850 | .devlist = devlist, | ||
5851 | }; | ||
5852 | struct rpc_message msg = { | ||
5853 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], | ||
5854 | .rpc_argp = &args, | ||
5855 | .rpc_resp = &res, | ||
5856 | }; | ||
5857 | int status; | ||
5858 | |||
5859 | dprintk("--> %s\n", __func__); | ||
5860 | status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, | ||
5861 | &res.seq_res, 0); | ||
5862 | dprintk("<-- %s status=%d\n", __func__, status); | ||
5863 | return status; | ||
5864 | } | ||
5865 | |||
5866 | int nfs4_proc_getdevicelist(struct nfs_server *server, | ||
5867 | const struct nfs_fh *fh, | ||
5868 | struct pnfs_devicelist *devlist) | ||
5869 | { | ||
5870 | struct nfs4_exception exception = { }; | ||
5871 | int err; | ||
5872 | |||
5873 | do { | ||
5874 | err = nfs4_handle_exception(server, | ||
5875 | _nfs4_getdevicelist(server, fh, devlist), | ||
5876 | &exception); | ||
5877 | } while (exception.retry); | ||
5878 | |||
5879 | dprintk("%s: err=%d, num_devs=%u\n", __func__, | ||
5880 | err, devlist->num_devs); | ||
5881 | |||
5882 | return err; | ||
5883 | } | ||
5884 | EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); | ||
5885 | |||
5837 | static int | 5886 | static int |
5838 | _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) | 5887 | _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) |
5839 | { | 5888 | { |
@@ -5912,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) | |||
5912 | static void nfs4_layoutcommit_release(void *calldata) | 5961 | static void nfs4_layoutcommit_release(void *calldata) |
5913 | { | 5962 | { |
5914 | struct nfs4_layoutcommit_data *data = calldata; | 5963 | struct nfs4_layoutcommit_data *data = calldata; |
5964 | struct pnfs_layout_segment *lseg, *tmp; | ||
5915 | 5965 | ||
5966 | pnfs_cleanup_layoutcommit(data); | ||
5916 | /* Matched by references in pnfs_set_layoutcommit */ | 5967 | /* Matched by references in pnfs_set_layoutcommit */ |
5917 | put_lseg(data->lseg); | 5968 | list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { |
5969 | list_del_init(&lseg->pls_lc_list); | ||
5970 | if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, | ||
5971 | &lseg->pls_flags)) | ||
5972 | put_lseg(lseg); | ||
5973 | } | ||
5918 | put_rpccred(data->cred); | 5974 | put_rpccred(data->cred); |
5919 | kfree(data); | 5975 | kfree(data); |
5920 | } | 5976 | } |
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c191a9baa422..1dce12f41a4f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c | |||
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int); | |||
113 | #define encode_restorefh_maxsz (op_encode_hdr_maxsz) | 113 | #define encode_restorefh_maxsz (op_encode_hdr_maxsz) |
114 | #define decode_restorefh_maxsz (op_decode_hdr_maxsz) | 114 | #define decode_restorefh_maxsz (op_decode_hdr_maxsz) |
115 | #define encode_fsinfo_maxsz (encode_getattr_maxsz) | 115 | #define encode_fsinfo_maxsz (encode_getattr_maxsz) |
116 | #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) | 116 | /* The 5 accounts for the PNFS attributes, and assumes that at most three |
117 | * layout types will be returned. | ||
118 | */ | ||
119 | #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ | ||
120 | nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) | ||
117 | #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) | 121 | #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) |
118 | #define decode_renew_maxsz (op_decode_hdr_maxsz) | 122 | #define decode_renew_maxsz (op_decode_hdr_maxsz) |
119 | #define encode_setclientid_maxsz \ | 123 | #define encode_setclientid_maxsz \ |
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int); | |||
314 | XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) | 318 | XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) |
315 | #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) | 319 | #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) |
316 | #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) | 320 | #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) |
321 | #define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ | ||
322 | encode_verifier_maxsz) | ||
323 | #define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ | ||
324 | 2 /* nfs_cookie4 gdlr_cookie */ + \ | ||
325 | decode_verifier_maxsz \ | ||
326 | /* verifier4 gdlr_verifier */ + \ | ||
327 | 1 /* gdlr_deviceid_list count */ + \ | ||
328 | XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ | ||
329 | NFS4_DEVICEID4_SIZE) \ | ||
330 | /* gdlr_deviceid_list */ + \ | ||
331 | 1 /* bool gdlr_eof */) | ||
317 | #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ | 332 | #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ |
318 | XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) | 333 | XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) |
319 | #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ | 334 | #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ |
@@ -748,6 +763,14 @@ static int nfs4_stat_to_errno(int); | |||
748 | #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ | 763 | #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ |
749 | decode_sequence_maxsz + \ | 764 | decode_sequence_maxsz + \ |
750 | decode_reclaim_complete_maxsz) | 765 | decode_reclaim_complete_maxsz) |
766 | #define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ | ||
767 | encode_sequence_maxsz + \ | ||
768 | encode_putfh_maxsz + \ | ||
769 | encode_getdevicelist_maxsz) | ||
770 | #define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ | ||
771 | decode_sequence_maxsz + \ | ||
772 | decode_putfh_maxsz + \ | ||
773 | decode_getdevicelist_maxsz) | ||
751 | #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ | 774 | #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ |
752 | encode_sequence_maxsz +\ | 775 | encode_sequence_maxsz +\ |
753 | encode_getdeviceinfo_maxsz) | 776 | encode_getdeviceinfo_maxsz) |
@@ -1104,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm | |||
1104 | hdr->replen += decode_getattr_maxsz; | 1127 | hdr->replen += decode_getattr_maxsz; |
1105 | } | 1128 | } |
1106 | 1129 | ||
1130 | static void | ||
1131 | encode_getattr_three(struct xdr_stream *xdr, | ||
1132 | uint32_t bm0, uint32_t bm1, uint32_t bm2, | ||
1133 | struct compound_hdr *hdr) | ||
1134 | { | ||
1135 | __be32 *p; | ||
1136 | |||
1137 | p = reserve_space(xdr, 4); | ||
1138 | *p = cpu_to_be32(OP_GETATTR); | ||
1139 | if (bm2) { | ||
1140 | p = reserve_space(xdr, 16); | ||
1141 | *p++ = cpu_to_be32(3); | ||
1142 | *p++ = cpu_to_be32(bm0); | ||
1143 | *p++ = cpu_to_be32(bm1); | ||
1144 | *p = cpu_to_be32(bm2); | ||
1145 | } else if (bm1) { | ||
1146 | p = reserve_space(xdr, 12); | ||
1147 | *p++ = cpu_to_be32(2); | ||
1148 | *p++ = cpu_to_be32(bm0); | ||
1149 | *p = cpu_to_be32(bm1); | ||
1150 | } else { | ||
1151 | p = reserve_space(xdr, 8); | ||
1152 | *p++ = cpu_to_be32(1); | ||
1153 | *p = cpu_to_be32(bm0); | ||
1154 | } | ||
1155 | hdr->nops++; | ||
1156 | hdr->replen += decode_getattr_maxsz; | ||
1157 | } | ||
1158 | |||
1107 | static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) | 1159 | static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) |
1108 | { | 1160 | { |
1109 | encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], | 1161 | encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], |
@@ -1112,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c | |||
1112 | 1164 | ||
1113 | static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) | 1165 | static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) |
1114 | { | 1166 | { |
1115 | encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], | 1167 | encode_getattr_three(xdr, |
1116 | bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); | 1168 | bitmask[0] & nfs4_fsinfo_bitmap[0], |
1169 | bitmask[1] & nfs4_fsinfo_bitmap[1], | ||
1170 | bitmask[2] & nfs4_fsinfo_bitmap[2], | ||
1171 | hdr); | ||
1117 | } | 1172 | } |
1118 | 1173 | ||
1119 | static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) | 1174 | static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) |
@@ -1855,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr, | |||
1855 | 1910 | ||
1856 | #ifdef CONFIG_NFS_V4_1 | 1911 | #ifdef CONFIG_NFS_V4_1 |
1857 | static void | 1912 | static void |
1913 | encode_getdevicelist(struct xdr_stream *xdr, | ||
1914 | const struct nfs4_getdevicelist_args *args, | ||
1915 | struct compound_hdr *hdr) | ||
1916 | { | ||
1917 | __be32 *p; | ||
1918 | nfs4_verifier dummy = { | ||
1919 | .data = "dummmmmy", | ||
1920 | }; | ||
1921 | |||
1922 | p = reserve_space(xdr, 20); | ||
1923 | *p++ = cpu_to_be32(OP_GETDEVICELIST); | ||
1924 | *p++ = cpu_to_be32(args->layoutclass); | ||
1925 | *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); | ||
1926 | xdr_encode_hyper(p, 0ULL); /* cookie */ | ||
1927 | encode_nfs4_verifier(xdr, &dummy); | ||
1928 | hdr->nops++; | ||
1929 | hdr->replen += decode_getdevicelist_maxsz; | ||
1930 | } | ||
1931 | |||
1932 | static void | ||
1858 | encode_getdeviceinfo(struct xdr_stream *xdr, | 1933 | encode_getdeviceinfo(struct xdr_stream *xdr, |
1859 | const struct nfs4_getdeviceinfo_args *args, | 1934 | const struct nfs4_getdeviceinfo_args *args, |
1860 | struct compound_hdr *hdr) | 1935 | struct compound_hdr *hdr) |
@@ -1916,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr, | |||
1916 | *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); | 1991 | *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); |
1917 | /* Only whole file layouts */ | 1992 | /* Only whole file layouts */ |
1918 | p = xdr_encode_hyper(p, 0); /* offset */ | 1993 | p = xdr_encode_hyper(p, 0); /* offset */ |
1919 | p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ | 1994 | p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ |
1920 | *p++ = cpu_to_be32(0); /* reclaim */ | 1995 | *p++ = cpu_to_be32(0); /* reclaim */ |
1921 | p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); | 1996 | p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); |
1922 | *p++ = cpu_to_be32(1); /* newoffset = TRUE */ | 1997 | *p++ = cpu_to_be32(1); /* newoffset = TRUE */ |
@@ -2604,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, | |||
2604 | struct compound_hdr hdr = { | 2679 | struct compound_hdr hdr = { |
2605 | .nops = 0, | 2680 | .nops = 0, |
2606 | }; | 2681 | }; |
2607 | const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; | 2682 | const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; |
2608 | 2683 | ||
2609 | encode_compound_hdr(xdr, req, &hdr); | 2684 | encode_compound_hdr(xdr, req, &hdr); |
2610 | encode_setclientid_confirm(xdr, arg, &hdr); | 2685 | encode_setclientid_confirm(xdr, arg, &hdr); |
@@ -2748,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, | |||
2748 | struct compound_hdr hdr = { | 2823 | struct compound_hdr hdr = { |
2749 | .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), | 2824 | .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), |
2750 | }; | 2825 | }; |
2751 | const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; | 2826 | const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; |
2752 | 2827 | ||
2753 | encode_compound_hdr(xdr, req, &hdr); | 2828 | encode_compound_hdr(xdr, req, &hdr); |
2754 | encode_sequence(xdr, &args->la_seq_args, &hdr); | 2829 | encode_sequence(xdr, &args->la_seq_args, &hdr); |
@@ -2775,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, | |||
2775 | } | 2850 | } |
2776 | 2851 | ||
2777 | /* | 2852 | /* |
2853 | * Encode GETDEVICELIST request | ||
2854 | */ | ||
2855 | static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, | ||
2856 | struct xdr_stream *xdr, | ||
2857 | struct nfs4_getdevicelist_args *args) | ||
2858 | { | ||
2859 | struct compound_hdr hdr = { | ||
2860 | .minorversion = nfs4_xdr_minorversion(&args->seq_args), | ||
2861 | }; | ||
2862 | |||
2863 | encode_compound_hdr(xdr, req, &hdr); | ||
2864 | encode_sequence(xdr, &args->seq_args, &hdr); | ||
2865 | encode_putfh(xdr, args->fh, &hdr); | ||
2866 | encode_getdevicelist(xdr, args, &hdr); | ||
2867 | encode_nops(&hdr); | ||
2868 | } | ||
2869 | |||
2870 | /* | ||
2778 | * Encode GETDEVICEINFO request | 2871 | * Encode GETDEVICEINFO request |
2779 | */ | 2872 | */ |
2780 | static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, | 2873 | static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, |
@@ -3011,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) | |||
3011 | goto out_overflow; | 3104 | goto out_overflow; |
3012 | bmlen = be32_to_cpup(p); | 3105 | bmlen = be32_to_cpup(p); |
3013 | 3106 | ||
3014 | bitmap[0] = bitmap[1] = 0; | 3107 | bitmap[0] = bitmap[1] = bitmap[2] = 0; |
3015 | p = xdr_inline_decode(xdr, (bmlen << 2)); | 3108 | p = xdr_inline_decode(xdr, (bmlen << 2)); |
3016 | if (unlikely(!p)) | 3109 | if (unlikely(!p)) |
3017 | goto out_overflow; | 3110 | goto out_overflow; |
3018 | if (bmlen > 0) { | 3111 | if (bmlen > 0) { |
3019 | bitmap[0] = be32_to_cpup(p++); | 3112 | bitmap[0] = be32_to_cpup(p++); |
3020 | if (bmlen > 1) | 3113 | if (bmlen > 1) { |
3021 | bitmap[1] = be32_to_cpup(p); | 3114 | bitmap[1] = be32_to_cpup(p++); |
3115 | if (bmlen > 2) | ||
3116 | bitmap[2] = be32_to_cpup(p); | ||
3117 | } | ||
3022 | } | 3118 | } |
3023 | return 0; | 3119 | return 0; |
3024 | out_overflow: | 3120 | out_overflow: |
@@ -3050,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 | |||
3050 | return ret; | 3146 | return ret; |
3051 | bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; | 3147 | bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; |
3052 | } else | 3148 | } else |
3053 | bitmask[0] = bitmask[1] = 0; | 3149 | bitmask[0] = bitmask[1] = bitmask[2] = 0; |
3054 | dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); | 3150 | dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, |
3151 | bitmask[0], bitmask[1], bitmask[2]); | ||
3055 | return 0; | 3152 | return 0; |
3056 | } | 3153 | } |
3057 | 3154 | ||
@@ -4105,7 +4202,7 @@ out_overflow: | |||
4105 | static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) | 4202 | static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) |
4106 | { | 4203 | { |
4107 | __be32 *savep; | 4204 | __be32 *savep; |
4108 | uint32_t attrlen, bitmap[2] = {0}; | 4205 | uint32_t attrlen, bitmap[3] = {0}; |
4109 | int status; | 4206 | int status; |
4110 | 4207 | ||
4111 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) | 4208 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) |
@@ -4131,7 +4228,7 @@ xdr_error: | |||
4131 | static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) | 4228 | static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) |
4132 | { | 4229 | { |
4133 | __be32 *savep; | 4230 | __be32 *savep; |
4134 | uint32_t attrlen, bitmap[2] = {0}; | 4231 | uint32_t attrlen, bitmap[3] = {0}; |
4135 | int status; | 4232 | int status; |
4136 | 4233 | ||
4137 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) | 4234 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) |
@@ -4163,7 +4260,7 @@ xdr_error: | |||
4163 | static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) | 4260 | static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) |
4164 | { | 4261 | { |
4165 | __be32 *savep; | 4262 | __be32 *savep; |
4166 | uint32_t attrlen, bitmap[2] = {0}; | 4263 | uint32_t attrlen, bitmap[3] = {0}; |
4167 | int status; | 4264 | int status; |
4168 | 4265 | ||
4169 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) | 4266 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) |
@@ -4303,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat | |||
4303 | { | 4400 | { |
4304 | __be32 *savep; | 4401 | __be32 *savep; |
4305 | uint32_t attrlen, | 4402 | uint32_t attrlen, |
4306 | bitmap[2] = {0}; | 4403 | bitmap[3] = {0}; |
4307 | int status; | 4404 | int status; |
4308 | 4405 | ||
4309 | status = decode_op_hdr(xdr, OP_GETATTR); | 4406 | status = decode_op_hdr(xdr, OP_GETATTR); |
@@ -4389,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, | |||
4389 | return status; | 4486 | return status; |
4390 | } | 4487 | } |
4391 | 4488 | ||
4489 | /* | ||
4490 | * The preferred block size for layout directed io | ||
4491 | */ | ||
4492 | static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, | ||
4493 | uint32_t *res) | ||
4494 | { | ||
4495 | __be32 *p; | ||
4496 | |||
4497 | dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); | ||
4498 | *res = 0; | ||
4499 | if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { | ||
4500 | p = xdr_inline_decode(xdr, 4); | ||
4501 | if (unlikely(!p)) { | ||
4502 | print_overflow_msg(__func__, xdr); | ||
4503 | return -EIO; | ||
4504 | } | ||
4505 | *res = be32_to_cpup(p); | ||
4506 | bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; | ||
4507 | } | ||
4508 | return 0; | ||
4509 | } | ||
4510 | |||
4392 | static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) | 4511 | static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) |
4393 | { | 4512 | { |
4394 | __be32 *savep; | 4513 | __be32 *savep; |
4395 | uint32_t attrlen, bitmap[2]; | 4514 | uint32_t attrlen, bitmap[3]; |
4396 | int status; | 4515 | int status; |
4397 | 4516 | ||
4398 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) | 4517 | if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) |
@@ -4420,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) | |||
4420 | status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); | 4539 | status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); |
4421 | if (status != 0) | 4540 | if (status != 0) |
4422 | goto xdr_error; | 4541 | goto xdr_error; |
4542 | status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); | ||
4543 | if (status) | ||
4544 | goto xdr_error; | ||
4423 | 4545 | ||
4424 | status = verify_attr_len(xdr, savep, attrlen); | 4546 | status = verify_attr_len(xdr, savep, attrlen); |
4425 | xdr_error: | 4547 | xdr_error: |
@@ -4839,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, | |||
4839 | { | 4961 | { |
4840 | __be32 *savep; | 4962 | __be32 *savep; |
4841 | uint32_t attrlen, | 4963 | uint32_t attrlen, |
4842 | bitmap[2] = {0}; | 4964 | bitmap[3] = {0}; |
4843 | struct kvec *iov = req->rq_rcv_buf.head; | 4965 | struct kvec *iov = req->rq_rcv_buf.head; |
4844 | int status; | 4966 | int status; |
4845 | 4967 | ||
@@ -5268,6 +5390,53 @@ out_overflow: | |||
5268 | } | 5390 | } |
5269 | 5391 | ||
5270 | #if defined(CONFIG_NFS_V4_1) | 5392 | #if defined(CONFIG_NFS_V4_1) |
5393 | /* | ||
5394 | * TODO: Need to handle case when EOF != true; | ||
5395 | */ | ||
5396 | static int decode_getdevicelist(struct xdr_stream *xdr, | ||
5397 | struct pnfs_devicelist *res) | ||
5398 | { | ||
5399 | __be32 *p; | ||
5400 | int status, i; | ||
5401 | struct nfs_writeverf verftemp; | ||
5402 | |||
5403 | status = decode_op_hdr(xdr, OP_GETDEVICELIST); | ||
5404 | if (status) | ||
5405 | return status; | ||
5406 | |||
5407 | p = xdr_inline_decode(xdr, 8 + 8 + 4); | ||
5408 | if (unlikely(!p)) | ||
5409 | goto out_overflow; | ||
5410 | |||
5411 | /* TODO: Skip cookie for now */ | ||
5412 | p += 2; | ||
5413 | |||
5414 | /* Read verifier */ | ||
5415 | p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); | ||
5416 | |||
5417 | res->num_devs = be32_to_cpup(p); | ||
5418 | |||
5419 | dprintk("%s: num_dev %d\n", __func__, res->num_devs); | ||
5420 | |||
5421 | if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { | ||
5422 | printk(KERN_ERR "%s too many result dev_num %u\n", | ||
5423 | __func__, res->num_devs); | ||
5424 | return -EIO; | ||
5425 | } | ||
5426 | |||
5427 | p = xdr_inline_decode(xdr, | ||
5428 | res->num_devs * NFS4_DEVICEID4_SIZE + 4); | ||
5429 | if (unlikely(!p)) | ||
5430 | goto out_overflow; | ||
5431 | for (i = 0; i < res->num_devs; i++) | ||
5432 | p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, | ||
5433 | NFS4_DEVICEID4_SIZE); | ||
5434 | res->eof = be32_to_cpup(p); | ||
5435 | return 0; | ||
5436 | out_overflow: | ||
5437 | print_overflow_msg(__func__, xdr); | ||
5438 | return -EIO; | ||
5439 | } | ||
5271 | 5440 | ||
5272 | static int decode_getdeviceinfo(struct xdr_stream *xdr, | 5441 | static int decode_getdeviceinfo(struct xdr_stream *xdr, |
5273 | struct pnfs_device *pdev) | 5442 | struct pnfs_device *pdev) |
@@ -5430,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr, | |||
5430 | int status; | 5599 | int status; |
5431 | 5600 | ||
5432 | status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); | 5601 | status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); |
5602 | res->status = status; | ||
5433 | if (status) | 5603 | if (status) |
5434 | return status; | 5604 | return status; |
5435 | 5605 | ||
@@ -6542,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, | |||
6542 | } | 6712 | } |
6543 | 6713 | ||
6544 | /* | 6714 | /* |
6715 | * Decode GETDEVICELIST response | ||
6716 | */ | ||
6717 | static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, | ||
6718 | struct xdr_stream *xdr, | ||
6719 | struct nfs4_getdevicelist_res *res) | ||
6720 | { | ||
6721 | struct compound_hdr hdr; | ||
6722 | int status; | ||
6723 | |||
6724 | dprintk("encoding getdevicelist!\n"); | ||
6725 | |||
6726 | status = decode_compound_hdr(xdr, &hdr); | ||
6727 | if (status != 0) | ||
6728 | goto out; | ||
6729 | status = decode_sequence(xdr, &res->seq_res, rqstp); | ||
6730 | if (status != 0) | ||
6731 | goto out; | ||
6732 | status = decode_putfh(xdr); | ||
6733 | if (status != 0) | ||
6734 | goto out; | ||
6735 | status = decode_getdevicelist(xdr, res->devlist); | ||
6736 | out: | ||
6737 | return status; | ||
6738 | } | ||
6739 | |||
6740 | /* | ||
6545 | * Decode GETDEVICEINFO response | 6741 | * Decode GETDEVICEINFO response |
6546 | */ | 6742 | */ |
6547 | static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, | 6743 | static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, |
@@ -6722,7 +6918,7 @@ out: | |||
6722 | int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, | 6918 | int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, |
6723 | int plus) | 6919 | int plus) |
6724 | { | 6920 | { |
6725 | uint32_t bitmap[2] = {0}; | 6921 | uint32_t bitmap[3] = {0}; |
6726 | uint32_t len; | 6922 | uint32_t len; |
6727 | __be32 *p = xdr_inline_decode(xdr, 4); | 6923 | __be32 *p = xdr_inline_decode(xdr, 4); |
6728 | if (unlikely(!p)) | 6924 | if (unlikely(!p)) |
@@ -6908,6 +7104,7 @@ struct rpc_procinfo nfs4_procedures[] = { | |||
6908 | PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), | 7104 | PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), |
6909 | PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), | 7105 | PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), |
6910 | PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), | 7106 | PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), |
7107 | PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), | ||
6911 | #endif /* CONFIG_NFS_V4_1 */ | 7108 | #endif /* CONFIG_NFS_V4_1 */ |
6912 | }; | 7109 | }; |
6913 | 7110 | ||
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 38e5508555c6..e550e8836c37 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c | |||
@@ -76,8 +76,11 @@ find_pnfs_driver(u32 id) | |||
76 | void | 76 | void |
77 | unset_pnfs_layoutdriver(struct nfs_server *nfss) | 77 | unset_pnfs_layoutdriver(struct nfs_server *nfss) |
78 | { | 78 | { |
79 | if (nfss->pnfs_curr_ld) | 79 | if (nfss->pnfs_curr_ld) { |
80 | if (nfss->pnfs_curr_ld->clear_layoutdriver) | ||
81 | nfss->pnfs_curr_ld->clear_layoutdriver(nfss); | ||
80 | module_put(nfss->pnfs_curr_ld->owner); | 82 | module_put(nfss->pnfs_curr_ld->owner); |
83 | } | ||
81 | nfss->pnfs_curr_ld = NULL; | 84 | nfss->pnfs_curr_ld = NULL; |
82 | } | 85 | } |
83 | 86 | ||
@@ -88,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) | |||
88 | * @id layout type. Zero (illegal layout type) indicates pNFS not in use. | 91 | * @id layout type. Zero (illegal layout type) indicates pNFS not in use. |
89 | */ | 92 | */ |
90 | void | 93 | void |
91 | set_pnfs_layoutdriver(struct nfs_server *server, u32 id) | 94 | set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, |
95 | u32 id) | ||
92 | { | 96 | { |
93 | struct pnfs_layoutdriver_type *ld_type = NULL; | 97 | struct pnfs_layoutdriver_type *ld_type = NULL; |
94 | 98 | ||
@@ -115,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) | |||
115 | goto out_no_driver; | 119 | goto out_no_driver; |
116 | } | 120 | } |
117 | server->pnfs_curr_ld = ld_type; | 121 | server->pnfs_curr_ld = ld_type; |
122 | if (ld_type->set_layoutdriver | ||
123 | && ld_type->set_layoutdriver(server, mntfh)) { | ||
124 | printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", | ||
125 | __func__, id); | ||
126 | module_put(ld_type->owner); | ||
127 | goto out_no_driver; | ||
128 | } | ||
118 | 129 | ||
119 | dprintk("%s: pNFS module for %u set\n", __func__, id); | 130 | dprintk("%s: pNFS module for %u set\n", __func__, id); |
120 | return; | 131 | return; |
@@ -190,6 +201,7 @@ static void | |||
190 | pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) | 201 | pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) |
191 | { | 202 | { |
192 | struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; | 203 | struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; |
204 | put_rpccred(lo->plh_lc_cred); | ||
193 | return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); | 205 | return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); |
194 | } | 206 | } |
195 | 207 | ||
@@ -224,6 +236,7 @@ static void | |||
224 | init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) | 236 | init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) |
225 | { | 237 | { |
226 | INIT_LIST_HEAD(&lseg->pls_list); | 238 | INIT_LIST_HEAD(&lseg->pls_list); |
239 | INIT_LIST_HEAD(&lseg->pls_lc_list); | ||
227 | atomic_set(&lseg->pls_refcount, 1); | 240 | atomic_set(&lseg->pls_refcount, 1); |
228 | smp_mb(); | 241 | smp_mb(); |
229 | set_bit(NFS_LSEG_VALID, &lseg->pls_flags); | 242 | set_bit(NFS_LSEG_VALID, &lseg->pls_flags); |
@@ -816,7 +829,9 @@ out: | |||
816 | } | 829 | } |
817 | 830 | ||
818 | static struct pnfs_layout_hdr * | 831 | static struct pnfs_layout_hdr * |
819 | alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) | 832 | alloc_init_layout_hdr(struct inode *ino, |
833 | struct nfs_open_context *ctx, | ||
834 | gfp_t gfp_flags) | ||
820 | { | 835 | { |
821 | struct pnfs_layout_hdr *lo; | 836 | struct pnfs_layout_hdr *lo; |
822 | 837 | ||
@@ -828,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) | |||
828 | INIT_LIST_HEAD(&lo->plh_segs); | 843 | INIT_LIST_HEAD(&lo->plh_segs); |
829 | INIT_LIST_HEAD(&lo->plh_bulk_recall); | 844 | INIT_LIST_HEAD(&lo->plh_bulk_recall); |
830 | lo->plh_inode = ino; | 845 | lo->plh_inode = ino; |
846 | lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); | ||
831 | return lo; | 847 | return lo; |
832 | } | 848 | } |
833 | 849 | ||
834 | static struct pnfs_layout_hdr * | 850 | static struct pnfs_layout_hdr * |
835 | pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) | 851 | pnfs_find_alloc_layout(struct inode *ino, |
852 | struct nfs_open_context *ctx, | ||
853 | gfp_t gfp_flags) | ||
836 | { | 854 | { |
837 | struct nfs_inode *nfsi = NFS_I(ino); | 855 | struct nfs_inode *nfsi = NFS_I(ino); |
838 | struct pnfs_layout_hdr *new = NULL; | 856 | struct pnfs_layout_hdr *new = NULL; |
@@ -847,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) | |||
847 | return nfsi->layout; | 865 | return nfsi->layout; |
848 | } | 866 | } |
849 | spin_unlock(&ino->i_lock); | 867 | spin_unlock(&ino->i_lock); |
850 | new = alloc_init_layout_hdr(ino, gfp_flags); | 868 | new = alloc_init_layout_hdr(ino, ctx, gfp_flags); |
851 | spin_lock(&ino->i_lock); | 869 | spin_lock(&ino->i_lock); |
852 | 870 | ||
853 | if (likely(nfsi->layout == NULL)) /* Won the race? */ | 871 | if (likely(nfsi->layout == NULL)) /* Won the race? */ |
@@ -940,7 +958,7 @@ pnfs_update_layout(struct inode *ino, | |||
940 | if (!pnfs_enabled_sb(NFS_SERVER(ino))) | 958 | if (!pnfs_enabled_sb(NFS_SERVER(ino))) |
941 | return NULL; | 959 | return NULL; |
942 | spin_lock(&ino->i_lock); | 960 | spin_lock(&ino->i_lock); |
943 | lo = pnfs_find_alloc_layout(ino, gfp_flags); | 961 | lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); |
944 | if (lo == NULL) { | 962 | if (lo == NULL) { |
945 | dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); | 963 | dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); |
946 | goto out_unlock; | 964 | goto out_unlock; |
@@ -1350,16 +1368,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) | |||
1350 | EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); | 1368 | EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); |
1351 | 1369 | ||
1352 | /* | 1370 | /* |
1353 | * Currently there is only one (whole file) write lseg. | 1371 | * There can be multiple RW segments. |
1354 | */ | 1372 | */ |
1355 | static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) | 1373 | static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) |
1356 | { | 1374 | { |
1357 | struct pnfs_layout_segment *lseg, *rv = NULL; | 1375 | struct pnfs_layout_segment *lseg; |
1358 | 1376 | ||
1359 | list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) | 1377 | list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { |
1360 | if (lseg->pls_range.iomode == IOMODE_RW) | 1378 | if (lseg->pls_range.iomode == IOMODE_RW && |
1361 | rv = lseg; | 1379 | test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) |
1362 | return rv; | 1380 | list_add(&lseg->pls_lc_list, listp); |
1381 | } | ||
1363 | } | 1382 | } |
1364 | 1383 | ||
1365 | void | 1384 | void |
@@ -1371,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) | |||
1371 | 1390 | ||
1372 | spin_lock(&nfsi->vfs_inode.i_lock); | 1391 | spin_lock(&nfsi->vfs_inode.i_lock); |
1373 | if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { | 1392 | if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { |
1374 | /* references matched in nfs4_layoutcommit_release */ | ||
1375 | get_lseg(wdata->lseg); | ||
1376 | wdata->lseg->pls_lc_cred = | ||
1377 | get_rpccred(wdata->args.context->state->owner->so_cred); | ||
1378 | mark_as_dirty = true; | 1393 | mark_as_dirty = true; |
1379 | dprintk("%s: Set layoutcommit for inode %lu ", | 1394 | dprintk("%s: Set layoutcommit for inode %lu ", |
1380 | __func__, wdata->inode->i_ino); | 1395 | __func__, wdata->inode->i_ino); |
1381 | } | 1396 | } |
1382 | if (end_pos > wdata->lseg->pls_end_pos) | 1397 | if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { |
1383 | wdata->lseg->pls_end_pos = end_pos; | 1398 | /* references matched in nfs4_layoutcommit_release */ |
1399 | get_lseg(wdata->lseg); | ||
1400 | } | ||
1401 | if (end_pos > nfsi->layout->plh_lwb) | ||
1402 | nfsi->layout->plh_lwb = end_pos; | ||
1384 | spin_unlock(&nfsi->vfs_inode.i_lock); | 1403 | spin_unlock(&nfsi->vfs_inode.i_lock); |
1404 | dprintk("%s: lseg %p end_pos %llu\n", | ||
1405 | __func__, wdata->lseg, nfsi->layout->plh_lwb); | ||
1385 | 1406 | ||
1386 | /* if pnfs_layoutcommit_inode() runs between inode locks, the next one | 1407 | /* if pnfs_layoutcommit_inode() runs between inode locks, the next one |
1387 | * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ | 1408 | * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ |
@@ -1390,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) | |||
1390 | } | 1411 | } |
1391 | EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); | 1412 | EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); |
1392 | 1413 | ||
1414 | void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) | ||
1415 | { | ||
1416 | struct nfs_server *nfss = NFS_SERVER(data->args.inode); | ||
1417 | |||
1418 | if (nfss->pnfs_curr_ld->cleanup_layoutcommit) | ||
1419 | nfss->pnfs_curr_ld->cleanup_layoutcommit(data); | ||
1420 | } | ||
1421 | |||
1393 | /* | 1422 | /* |
1394 | * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and | 1423 | * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and |
1395 | * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough | 1424 | * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough |
@@ -1403,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) | |||
1403 | { | 1432 | { |
1404 | struct nfs4_layoutcommit_data *data; | 1433 | struct nfs4_layoutcommit_data *data; |
1405 | struct nfs_inode *nfsi = NFS_I(inode); | 1434 | struct nfs_inode *nfsi = NFS_I(inode); |
1406 | struct pnfs_layout_segment *lseg; | ||
1407 | struct rpc_cred *cred; | ||
1408 | loff_t end_pos; | 1435 | loff_t end_pos; |
1409 | int status = 0; | 1436 | int status = 0; |
1410 | 1437 | ||
@@ -1421,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) | |||
1421 | goto out; | 1448 | goto out; |
1422 | } | 1449 | } |
1423 | 1450 | ||
1451 | INIT_LIST_HEAD(&data->lseg_list); | ||
1424 | spin_lock(&inode->i_lock); | 1452 | spin_lock(&inode->i_lock); |
1425 | if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { | 1453 | if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { |
1426 | spin_unlock(&inode->i_lock); | 1454 | spin_unlock(&inode->i_lock); |
1427 | kfree(data); | 1455 | kfree(data); |
1428 | goto out; | 1456 | goto out; |
1429 | } | 1457 | } |
1430 | /* | ||
1431 | * Currently only one (whole file) write lseg which is referenced | ||
1432 | * in pnfs_set_layoutcommit and will be found. | ||
1433 | */ | ||
1434 | lseg = pnfs_list_write_lseg(inode); | ||
1435 | 1458 | ||
1436 | end_pos = lseg->pls_end_pos; | 1459 | pnfs_list_write_lseg(inode, &data->lseg_list); |
1437 | cred = lseg->pls_lc_cred; | 1460 | |
1438 | lseg->pls_end_pos = 0; | 1461 | end_pos = nfsi->layout->plh_lwb; |
1439 | lseg->pls_lc_cred = NULL; | 1462 | nfsi->layout->plh_lwb = 0; |
1440 | 1463 | ||
1441 | memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, | 1464 | memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, |
1442 | sizeof(nfsi->layout->plh_stateid.data)); | 1465 | sizeof(nfsi->layout->plh_stateid.data)); |
1443 | spin_unlock(&inode->i_lock); | 1466 | spin_unlock(&inode->i_lock); |
1444 | 1467 | ||
1445 | data->args.inode = inode; | 1468 | data->args.inode = inode; |
1446 | data->lseg = lseg; | 1469 | data->cred = get_rpccred(nfsi->layout->plh_lc_cred); |
1447 | data->cred = cred; | ||
1448 | nfs_fattr_init(&data->fattr); | 1470 | nfs_fattr_init(&data->fattr); |
1449 | data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; | 1471 | data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; |
1450 | data->res.fattr = &data->fattr; | 1472 | data->res.fattr = &data->fattr; |
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 078670dfbe04..e0b5d80a43f6 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h | |||
@@ -36,16 +36,16 @@ | |||
36 | enum { | 36 | enum { |
37 | NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ | 37 | NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ |
38 | NFS_LSEG_ROC, /* roc bit received from server */ | 38 | NFS_LSEG_ROC, /* roc bit received from server */ |
39 | NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ | ||
39 | }; | 40 | }; |
40 | 41 | ||
41 | struct pnfs_layout_segment { | 42 | struct pnfs_layout_segment { |
42 | struct list_head pls_list; | 43 | struct list_head pls_list; |
44 | struct list_head pls_lc_list; | ||
43 | struct pnfs_layout_range pls_range; | 45 | struct pnfs_layout_range pls_range; |
44 | atomic_t pls_refcount; | 46 | atomic_t pls_refcount; |
45 | unsigned long pls_flags; | 47 | unsigned long pls_flags; |
46 | struct pnfs_layout_hdr *pls_layout; | 48 | struct pnfs_layout_hdr *pls_layout; |
47 | struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */ | ||
48 | loff_t pls_end_pos; /* LAYOUTCOMMIT write end */ | ||
49 | }; | 49 | }; |
50 | 50 | ||
51 | enum pnfs_try_status { | 51 | enum pnfs_try_status { |
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type { | |||
80 | struct module *owner; | 80 | struct module *owner; |
81 | unsigned flags; | 81 | unsigned flags; |
82 | 82 | ||
83 | int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); | ||
84 | int (*clear_layoutdriver) (struct nfs_server *); | ||
85 | |||
83 | struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); | 86 | struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); |
84 | void (*free_layout_hdr) (struct pnfs_layout_hdr *); | 87 | void (*free_layout_hdr) (struct pnfs_layout_hdr *); |
85 | 88 | ||
@@ -110,6 +113,8 @@ struct pnfs_layoutdriver_type { | |||
110 | struct xdr_stream *xdr, | 113 | struct xdr_stream *xdr, |
111 | const struct nfs4_layoutreturn_args *args); | 114 | const struct nfs4_layoutreturn_args *args); |
112 | 115 | ||
116 | void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); | ||
117 | |||
113 | void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, | 118 | void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, |
114 | struct xdr_stream *xdr, | 119 | struct xdr_stream *xdr, |
115 | const struct nfs4_layoutcommit_args *args); | 120 | const struct nfs4_layoutcommit_args *args); |
@@ -125,6 +130,8 @@ struct pnfs_layout_hdr { | |||
125 | unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ | 130 | unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ |
126 | u32 plh_barrier; /* ignore lower seqids */ | 131 | u32 plh_barrier; /* ignore lower seqids */ |
127 | unsigned long plh_flags; | 132 | unsigned long plh_flags; |
133 | loff_t plh_lwb; /* last write byte for layoutcommit */ | ||
134 | struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ | ||
128 | struct inode *plh_inode; | 135 | struct inode *plh_inode; |
129 | }; | 136 | }; |
130 | 137 | ||
@@ -137,10 +144,21 @@ struct pnfs_device { | |||
137 | unsigned int pglen; | 144 | unsigned int pglen; |
138 | }; | 145 | }; |
139 | 146 | ||
147 | #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 | ||
148 | |||
149 | struct pnfs_devicelist { | ||
150 | unsigned int eof; | ||
151 | unsigned int num_devs; | ||
152 | struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; | ||
153 | }; | ||
154 | |||
140 | extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); | 155 | extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); |
141 | extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); | 156 | extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); |
142 | 157 | ||
143 | /* nfs4proc.c */ | 158 | /* nfs4proc.c */ |
159 | extern int nfs4_proc_getdevicelist(struct nfs_server *server, | ||
160 | const struct nfs_fh *fh, | ||
161 | struct pnfs_devicelist *devlist); | ||
144 | extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, | 162 | extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, |
145 | struct pnfs_device *dev); | 163 | struct pnfs_device *dev); |
146 | extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); | 164 | extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); |
@@ -153,7 +171,7 @@ void put_lseg(struct pnfs_layout_segment *lseg); | |||
153 | bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); | 171 | bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); |
154 | bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); | 172 | bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); |
155 | 173 | ||
156 | void set_pnfs_layoutdriver(struct nfs_server *, u32 id); | 174 | void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); |
157 | void unset_pnfs_layoutdriver(struct nfs_server *); | 175 | void unset_pnfs_layoutdriver(struct nfs_server *); |
158 | void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); | 176 | void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); |
159 | int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); | 177 | int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); |
@@ -179,6 +197,7 @@ void pnfs_roc_release(struct inode *ino); | |||
179 | void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); | 197 | void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); |
180 | bool pnfs_roc_drain(struct inode *ino, u32 *barrier); | 198 | bool pnfs_roc_drain(struct inode *ino, u32 *barrier); |
181 | void pnfs_set_layoutcommit(struct nfs_write_data *wdata); | 199 | void pnfs_set_layoutcommit(struct nfs_write_data *wdata); |
200 | void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); | ||
182 | int pnfs_layoutcommit_inode(struct inode *inode, bool sync); | 201 | int pnfs_layoutcommit_inode(struct inode *inode, bool sync); |
183 | int _pnfs_return_layout(struct inode *); | 202 | int _pnfs_return_layout(struct inode *); |
184 | int pnfs_ld_write_done(struct nfs_write_data *); | 203 | int pnfs_ld_write_done(struct nfs_write_data *); |
@@ -360,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier) | |||
360 | return false; | 379 | return false; |
361 | } | 380 | } |
362 | 381 | ||
363 | static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) | 382 | static inline void set_pnfs_layoutdriver(struct nfs_server *s, |
383 | const struct nfs_fh *mntfh, u32 id) | ||
364 | { | 384 | { |
365 | } | 385 | } |
366 | 386 | ||
diff --git a/include/linux/nfs.h b/include/linux/nfs.h index f387919bbc59..8c6ee44914cb 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h | |||
@@ -29,6 +29,8 @@ | |||
29 | #define NFS_MNT_VERSION 1 | 29 | #define NFS_MNT_VERSION 1 |
30 | #define NFS_MNT3_VERSION 3 | 30 | #define NFS_MNT3_VERSION 3 |
31 | 31 | ||
32 | #define NFS_PIPE_DIRNAME "/nfs" | ||
33 | |||
32 | /* | 34 | /* |
33 | * NFS stats. The good thing with these values is that NFSv3 errors are | 35 | * NFS stats. The good thing with these values is that NFSv3 errors are |
34 | * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which | 36 | * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which |
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index a3c4bc800dce..76f99e8714f3 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h | |||
@@ -566,6 +566,7 @@ enum { | |||
566 | NFSPROC4_CLNT_SECINFO_NO_NAME, | 566 | NFSPROC4_CLNT_SECINFO_NO_NAME, |
567 | NFSPROC4_CLNT_TEST_STATEID, | 567 | NFSPROC4_CLNT_TEST_STATEID, |
568 | NFSPROC4_CLNT_FREE_STATEID, | 568 | NFSPROC4_CLNT_FREE_STATEID, |
569 | NFSPROC4_CLNT_GETDEVICELIST, | ||
569 | }; | 570 | }; |
570 | 571 | ||
571 | /* nfs41 types */ | 572 | /* nfs41 types */ |
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8b579beb6358..b96fb99072ff 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h | |||
@@ -99,9 +99,10 @@ struct nfs_open_context { | |||
99 | 99 | ||
100 | struct nfs_open_dir_context { | 100 | struct nfs_open_dir_context { |
101 | struct rpc_cred *cred; | 101 | struct rpc_cred *cred; |
102 | unsigned long attr_gencount; | ||
102 | __u64 dir_cookie; | 103 | __u64 dir_cookie; |
103 | __u64 dup_cookie; | 104 | __u64 dup_cookie; |
104 | int duped; | 105 | signed char duped; |
105 | }; | 106 | }; |
106 | 107 | ||
107 | /* | 108 | /* |
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 50a661f8b45a..82fdfc7987d6 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h | |||
@@ -132,7 +132,7 @@ struct nfs_server { | |||
132 | #endif | 132 | #endif |
133 | 133 | ||
134 | #ifdef CONFIG_NFS_V4 | 134 | #ifdef CONFIG_NFS_V4 |
135 | u32 attr_bitmask[2];/* V4 bitmask representing the set | 135 | u32 attr_bitmask[3];/* V4 bitmask representing the set |
136 | of attributes supported on this | 136 | of attributes supported on this |
137 | filesystem */ | 137 | filesystem */ |
138 | u32 cache_consistency_bitmask[2]; | 138 | u32 cache_consistency_bitmask[2]; |
@@ -145,6 +145,8 @@ struct nfs_server { | |||
145 | filesystem */ | 145 | filesystem */ |
146 | struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ | 146 | struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ |
147 | struct rpc_wait_queue roc_rpcwaitq; | 147 | struct rpc_wait_queue roc_rpcwaitq; |
148 | u32 pnfs_blksize; /* layout_blksize attr */ | ||
149 | void *pnfs_ld_data; /* per mount point data */ | ||
148 | 150 | ||
149 | /* the following fields are protected by nfs_client->cl_lock */ | 151 | /* the following fields are protected by nfs_client->cl_lock */ |
150 | struct rb_root state_owners; | 152 | struct rb_root state_owners; |
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5b115956abac..569ea5b76fda 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h | |||
@@ -122,6 +122,7 @@ struct nfs_fsinfo { | |||
122 | struct timespec time_delta; /* server time granularity */ | 122 | struct timespec time_delta; /* server time granularity */ |
123 | __u32 lease_time; /* in seconds */ | 123 | __u32 lease_time; /* in seconds */ |
124 | __u32 layouttype; /* supported pnfs layout driver */ | 124 | __u32 layouttype; /* supported pnfs layout driver */ |
125 | __u32 blksize; /* preferred pnfs io block size */ | ||
125 | }; | 126 | }; |
126 | 127 | ||
127 | struct nfs_fsstat { | 128 | struct nfs_fsstat { |
@@ -235,6 +236,17 @@ struct nfs4_layoutget { | |||
235 | gfp_t gfp_flags; | 236 | gfp_t gfp_flags; |
236 | }; | 237 | }; |
237 | 238 | ||
239 | struct nfs4_getdevicelist_args { | ||
240 | const struct nfs_fh *fh; | ||
241 | u32 layoutclass; | ||
242 | struct nfs4_sequence_args seq_args; | ||
243 | }; | ||
244 | |||
245 | struct nfs4_getdevicelist_res { | ||
246 | struct pnfs_devicelist *devlist; | ||
247 | struct nfs4_sequence_res seq_res; | ||
248 | }; | ||
249 | |||
238 | struct nfs4_getdeviceinfo_args { | 250 | struct nfs4_getdeviceinfo_args { |
239 | struct pnfs_device *pdev; | 251 | struct pnfs_device *pdev; |
240 | struct nfs4_sequence_args seq_args; | 252 | struct nfs4_sequence_args seq_args; |
@@ -257,12 +269,13 @@ struct nfs4_layoutcommit_res { | |||
257 | struct nfs_fattr *fattr; | 269 | struct nfs_fattr *fattr; |
258 | const struct nfs_server *server; | 270 | const struct nfs_server *server; |
259 | struct nfs4_sequence_res seq_res; | 271 | struct nfs4_sequence_res seq_res; |
272 | int status; | ||
260 | }; | 273 | }; |
261 | 274 | ||
262 | struct nfs4_layoutcommit_data { | 275 | struct nfs4_layoutcommit_data { |
263 | struct rpc_task task; | 276 | struct rpc_task task; |
264 | struct nfs_fattr fattr; | 277 | struct nfs_fattr fattr; |
265 | struct pnfs_layout_segment *lseg; | 278 | struct list_head lseg_list; |
266 | struct rpc_cred *cred; | 279 | struct rpc_cred *cred; |
267 | struct nfs4_layoutcommit_args args; | 280 | struct nfs4_layoutcommit_args args; |
268 | struct nfs4_layoutcommit_res res; | 281 | struct nfs4_layoutcommit_res res; |
@@ -943,7 +956,7 @@ struct nfs4_server_caps_arg { | |||
943 | }; | 956 | }; |
944 | 957 | ||
945 | struct nfs4_server_caps_res { | 958 | struct nfs4_server_caps_res { |
946 | u32 attr_bitmask[2]; | 959 | u32 attr_bitmask[3]; |
947 | u32 acl_bitmask; | 960 | u32 acl_bitmask; |
948 | u32 has_links; | 961 | u32 has_links; |
949 | u32 has_symlinks; | 962 | u32 has_symlinks; |