aboutsummaryrefslogtreecommitdiffstats
path: root/fs/nfs/blocklayout
diff options
context:
space:
mode:
authorFred Isaman <iisaman@citi.umich.edu>2011-07-30 20:52:53 -0400
committerTrond Myklebust <Trond.Myklebust@netapp.com>2011-07-31 12:18:17 -0400
commit9549ec01b0dcf1c1eb277cba60067236b3f48508 (patch)
tree5f6aaeaafd088a13cf8369a11ac37367a44c2cde /fs/nfs/blocklayout
parentb2be7811dd94816f3df76708c8eb7f55bf7289e2 (diff)
pnfsblock: bl_read_pagelist
Note: When the upper layer's read/write request cannot be fulfilled, the block layout driver shouldn't silently mark the page as being in error. It should do what can be done and leave the rest to the upper layer. To do so, we should set rdata/wdata->res.count properly. When the upper layer re-sends the read/write request to finish the rest of the request, pgbase is the position at which we should start. [pnfsblock: mark IO error with NFS_LAYOUT_{RW|RO}_FAILED] Signed-off-by: Peng Tao <peng_tao@emc.com> [pnfsblock: read path error handling] Signed-off-by: Fred Isaman <iisaman@citi.umich.edu> [pnfsblock: handle errors when read or write pagelist.] Signed-off-by: Zhang Jingwang <yyalone@gmail.com> [pnfs-block: use new read_pagelist api] Signed-off-by: Benny Halevy <bhalevy@panasas.com> Signed-off-by: Benny Halevy <bhalevy@tonian.com> Signed-off-by: Jim Rees <rees@umich.edu> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'fs/nfs/blocklayout')
-rw-r--r--fs/nfs/blocklayout/blocklayout.c265
1 files changed, 265 insertions, 0 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6c1bafb8920b..facb5ba21204 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -29,10 +29,12 @@
29 * of the software, even if it has been or is hereafter advised of the 29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages. 30 * possibility of such damages.
31 */ 31 */
32
32#include <linux/module.h> 33#include <linux/module.h>
33#include <linux/init.h> 34#include <linux/init.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */
36 38
37#include "blocklayout.h" 39#include "blocklayout.h"
38 40
@@ -45,9 +47,272 @@ MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
45struct dentry *bl_device_pipe; 47struct dentry *bl_device_pipe;
46wait_queue_head_t bl_wq; 48wait_queue_head_t bl_wq;
47 49
/* Debug aid: dump the address of @page and a selection of its page-flag
 * states through dprintk().
 */
static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);

	/* One line per flag, in the order an I/O debugger usually wants them. */
	dprintk(" PagePrivate %d\n", PagePrivate(page));
	dprintk(" PageUptodate %d\n", PageUptodate(page));
	dprintk(" PageError %d\n", PageError(page));
	dprintk(" PageDirty %d\n", PageDirty(page));
	dprintk(" PageReferenced %d\n", PageReferenced(page));
	dprintk(" PageLocked %d\n", PageLocked(page));
	dprintk(" PageWriteback %d\n", PageWriteback(page));
	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));

	dprintk("\n");
}
63
64/* Given the be associated with isect, determine if page data needs to be
65 * initialized.
66 */
67static int is_hole(struct pnfs_block_extent *be, sector_t isect)
68{
69 if (be->be_state == PNFS_BLOCK_NONE_DATA)
70 return 1;
71 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
72 return 0;
73 else
74 return !bl_is_sector_init(be->be_inval, isect);
75}
76
77/* The data we are handed might be spread across several bios. We need
78 * to track when the last one is finished.
79 */
80struct parallel_io {
81 struct kref refcnt;
82 struct rpc_call_ops call_ops;
83 void (*pnfs_callback) (void *data);
84 void *data;
85};
86
87static inline struct parallel_io *alloc_parallel(void *data)
88{
89 struct parallel_io *rv;
90
91 rv = kmalloc(sizeof(*rv), GFP_NOFS);
92 if (rv) {
93 rv->data = data;
94 kref_init(&rv->refcnt);
95 }
96 return rv;
97}
98
99static inline void get_parallel(struct parallel_io *p)
100{
101 kref_get(&p->refcnt);
102}
103
104static void destroy_parallel(struct kref *kref)
105{
106 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
107
108 dprintk("%s enter\n", __func__);
109 p->pnfs_callback(p->data);
110 kfree(p);
111}
112
113static inline void put_parallel(struct parallel_io *p)
114{
115 kref_put(&p->refcnt, destroy_parallel);
116}
117
118static struct bio *
119bl_submit_bio(int rw, struct bio *bio)
120{
121 if (bio) {
122 get_parallel(bio->bi_private);
123 dprintk("%s submitting %s bio %u@%llu\n", __func__,
124 rw == READ ? "read" : "write",
125 bio->bi_size, (unsigned long long)bio->bi_sector);
126 submit_bio(rw, bio);
127 }
128 return NULL;
129}
130
131static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
132 struct pnfs_block_extent *be,
133 void (*end_io)(struct bio *, int err),
134 struct parallel_io *par)
135{
136 struct bio *bio;
137
138 bio = bio_alloc(GFP_NOIO, npg);
139 if (!bio)
140 return NULL;
141
142 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
143 bio->bi_bdev = be->be_mdev;
144 bio->bi_end_io = end_io;
145 bio->bi_private = par;
146 return bio;
147}
148
149static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
150 sector_t isect, struct page *page,
151 struct pnfs_block_extent *be,
152 void (*end_io)(struct bio *, int err),
153 struct parallel_io *par)
154{
155retry:
156 if (!bio) {
157 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
158 if (!bio)
159 return ERR_PTR(-ENOMEM);
160 }
161 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
162 bio = bl_submit_bio(rw, bio);
163 goto retry;
164 }
165 return bio;
166}
167
168static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
169{
170 if (lseg->pls_range.iomode == IOMODE_RW) {
171 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
172 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
173 } else {
174 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
175 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
176 }
177}
178
179/* This is basically copied from mpage_end_io_read */
180static void bl_end_io_read(struct bio *bio, int err)
181{
182 struct parallel_io *par = bio->bi_private;
183 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
184 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
185 struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
186
187 do {
188 struct page *page = bvec->bv_page;
189
190 if (--bvec >= bio->bi_io_vec)
191 prefetchw(&bvec->bv_page->flags);
192 if (uptodate)
193 SetPageUptodate(page);
194 } while (bvec >= bio->bi_io_vec);
195 if (!uptodate) {
196 if (!rdata->pnfs_error)
197 rdata->pnfs_error = -EIO;
198 bl_set_lo_fail(rdata->lseg);
199 }
200 bio_put(bio);
201 put_parallel(par);
202}
203
204static void bl_read_cleanup(struct work_struct *work)
205{
206 struct rpc_task *task;
207 struct nfs_read_data *rdata;
208 dprintk("%s enter\n", __func__);
209 task = container_of(work, struct rpc_task, u.tk_work);
210 rdata = container_of(task, struct nfs_read_data, task);
211 pnfs_ld_read_done(rdata);
212}
213
214static void
215bl_end_par_io_read(void *data)
216{
217 struct nfs_read_data *rdata = data;
218
219 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
220 schedule_work(&rdata->task.u.tk_work);
221}
222
/* Stub installed in place of the normal .rpc_call_done callback, which we
 * do not want to run for block-layout I/O.  Intentionally empty.
 */
static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
{
}
230
48static enum pnfs_try_status 231static enum pnfs_try_status
49bl_read_pagelist(struct nfs_read_data *rdata) 232bl_read_pagelist(struct nfs_read_data *rdata)
50{ 233{
234 int i, hole;
235 struct bio *bio = NULL;
236 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
237 sector_t isect, extent_length = 0;
238 struct parallel_io *par;
239 loff_t f_offset = rdata->args.offset;
240 size_t count = rdata->args.count;
241 struct page **pages = rdata->args.pages;
242 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
243
244 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
245 rdata->npages, f_offset, count);
246
247 par = alloc_parallel(rdata);
248 if (!par)
249 goto use_mds;
250 par->call_ops = *rdata->mds_ops;
251 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
252 par->pnfs_callback = bl_end_par_io_read;
253 /* At this point, we can no longer jump to use_mds */
254
255 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
256 /* Code assumes extents are page-aligned */
257 for (i = pg_index; i < rdata->npages; i++) {
258 if (!extent_length) {
259 /* We've used up the previous extent */
260 bl_put_extent(be);
261 bl_put_extent(cow_read);
262 bio = bl_submit_bio(READ, bio);
263 /* Get the next one */
264 be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
265 isect, &cow_read);
266 if (!be) {
267 rdata->pnfs_error = -EIO;
268 goto out;
269 }
270 extent_length = be->be_length -
271 (isect - be->be_f_offset);
272 if (cow_read) {
273 sector_t cow_length = cow_read->be_length -
274 (isect - cow_read->be_f_offset);
275 extent_length = min(extent_length, cow_length);
276 }
277 }
278 hole = is_hole(be, isect);
279 if (hole && !cow_read) {
280 bio = bl_submit_bio(READ, bio);
281 /* Fill hole w/ zeroes w/o accessing device */
282 dprintk("%s Zeroing page for hole\n", __func__);
283 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
284 print_page(pages[i]);
285 SetPageUptodate(pages[i]);
286 } else {
287 struct pnfs_block_extent *be_read;
288
289 be_read = (hole && cow_read) ? cow_read : be;
290 bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
291 isect, pages[i], be_read,
292 bl_end_io_read, par);
293 if (IS_ERR(bio)) {
294 rdata->pnfs_error = PTR_ERR(bio);
295 goto out;
296 }
297 }
298 isect += PAGE_CACHE_SECTORS;
299 extent_length -= PAGE_CACHE_SECTORS;
300 }
301 if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
302 rdata->res.eof = 1;
303 rdata->res.count = rdata->inode->i_size - f_offset;
304 } else {
305 rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
306 }
307out:
308 bl_put_extent(be);
309 bl_put_extent(cow_read);
310 bl_submit_bio(READ, bio);
311 put_parallel(par);
312 return PNFS_ATTEMPTED;
313
314 use_mds:
315 dprintk("Giving up and using normal NFS\n");
51 return PNFS_NOT_ATTEMPTED; 316 return PNFS_NOT_ATTEMPTED;
52} 317}
53 318