From d2224e7afbf2a6556f4f8f25bc0e96d99ec4d2bd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 6 Mar 2011 17:14:13 +0000 Subject: nfs: close NFSv4 COMMIT vs. CLOSE race I've been adding in more artificial delays in the NFSv4 commit and close codepaths to uncover races. The kernel I'm testing has the patch to close the race in __rpc_wait_for_completion_task that's in Trond's cthon2011 branch. The reproducer I've been using does this in a loop: mkdir("DIR"); fd = open("DIR/FILE", O_WRONLY|O_CREAT|O_EXCL, 0644); write(fd, "abcdefg", 7); close(fd); unlink("DIR/FILE"); rmdir("DIR"); The above reproducer shouldn't result in any silly-renaming. However, when I add a "msleep(100)" just after the nfs_commit_clear_lock call in nfs_commit_release, I can almost always force one to occur. If I can force it to occur with that, then it can happen without that delay given the right timing. nfs_commit_inode waits for the NFS_INO_COMMIT bit to clear when called with FLUSH_SYNC set. nfs_commit_rpcsetup on the other hand does not wait for the task to complete before putting its reference to it, so the last reference get put in rpc_release task and gets queued to a workqueue. In this situation, the last open context reference may be put by the COMMIT release instead of the close() syscall. The close() syscall returns too quickly and the unlink runs while the d_count is still high since the COMMIT release hasn't put its dentry reference yet. Fix this by having rpc_commit_rpcsetup wait for the RPC call to complete before putting the task reference when FLUSH_SYNC is set. With this, the last reference is put by the process that's initiating the FLUSH_SYNC commit and the race is closed. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c8278f4046cb..42b92d7a9cc4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } -- cgit v1.2.2 From 136028967a283929c6f01518d0700b73fa622d56 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:38 +0000 Subject: NFS: change nfs_writeback_done to return void The return values are not used by any callers. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 42b92d7a9cc4..ae528b98b804 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1132,7 +1132,7 @@ static const struct rpc_call_ops nfs_write_full_ops = { /* * This function is called when the WRITE call is complete. */ -int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; @@ -1151,7 +1151,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(data->inode)->write_done(task, data); if (status != 0) - return status; + return; nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -1196,7 +1196,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) argp->stable = NFS_FILE_SYNC; } nfs_restart_rpc(task, server->nfs_client); - return -EAGAIN; + return; } if (time_before(complain, jiffies)) { printk(KERN_WARNING @@ -1207,7 +1207,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - return 0; + return; } -- cgit v1.2.2 From 94ad1c80e28f9700c84b4d28d1e5302ddf63a6fd Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:14 +0000 Subject: NFSv4.1: coelesce across layout stripes Add a pg_test layout driver hook which is used to avoid coelescing I/O across layout stripes. Signed-off-by: Andy Adamson Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Boaz Harrosh Signed-off-by: Oleg Drokin Signed-off-by: Tao Guo Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae528b98b804..40143c4747a5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -28,6 +28,7 @@ #include "iostat.h" #include "nfs4_fs.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -982,6 +983,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; + pgio->pg_test = NULL; + if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); else -- cgit v1.2.2 From bae724ef95b0d0a1f4518f5451e7c8aabc41f820 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:15 +0000 Subject: NFSv4.1: shift pnfs_update_layout locations Move the pnfs_update_layout call location to nfs_pageio_do_add_request(). Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach it to each nfs_read_data so it can be sent to the layout driver. Signed-off-by: Andy Adamson Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Boaz Harrosh Signed-off-by: Oleg Drokin Signed-off-by: Tao Guo Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 40143c4747a5..f033fa0d7d33 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -880,7 +880,7 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) { struct nfs_page *req = nfs_list_entry(head->next); struct page *page = req->wb_page; @@ -947,7 +947,7 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) { struct nfs_page *req; struct page **pages; -- cgit v1.2.2 From d138d5d17be6a60d883e8bd4e22bc218d3adfab3 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 3 Mar 2011 15:13:41 +0000 Subject: NFSv4.1: rearrange nfs_write_rpcsetup Reorder nfs_write_rpcsetup, preparing for a pnfs entry point. Signed-off-by: Andy Adamson Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 82 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 36 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f033fa0d7d33..ae035990941a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -782,25 +782,21 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - int how) +static int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .task = &data->task, .rpc_message = &msg, .callback_ops = call_ops, @@ -811,12 +807,49 @@ static int nfs_write_rpcsetup(struct nfs_page *req, }; int ret = 0; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -837,30 +870,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - - dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } /* If a nfs_flush_* function fails, it should remove reqs from @head and -- cgit v1.2.2 From 5053aa568d4017aeb1fa35247d4ad96be262920f Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:43 +0000 Subject: NFSv4.1: Send lseg down into nfs_write_rpcsetup We grab the lseg sent in from the doio function and attach it to each struct nfs_write_data created. This is how the lseg will be sent to the layout driver. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae035990941a..72b0ec0bb0e1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -97,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p) static void nfs_writedata_release(struct nfs_write_data *wdata) { + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -840,6 +841,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg, int how) { struct inode *inode = req->wb_context->path.dentry->d_inode; @@ -850,6 +852,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -930,7 +933,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, how); + wsize, offset, lseg, how); if (ret == 0) ret = ret2; offset += wsize; @@ -978,7 +981,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i req = nfs_list_entry(data->pages.next); /* Set up the argument struct */ - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); + return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); -- cgit v1.2.2 From 44b83799a922a153957c65ccfc985a8c902958c8 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:44 +0000 Subject: NFSv4.1: trigger LAYOUTGET for writes Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 72b0ec0bb0e1..49c4784c24e5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned } while (nbytes != 0); atomic_set(&req->wb_complete, requests); + BUG_ON(lseg); + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); ClearPageError(page); offset = 0; nbytes = count; @@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned nbytes -= wsize; } while (nbytes != 0); + put_lseg(lseg); return ret; out_bad: @@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i struct nfs_page *req; struct page **pages; struct nfs_write_data *data; + int ret; data = nfs_writedata_alloc(npages); - if (!data) - goto out_bad; - + if (!data) { + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } + ret = -ENOMEM; + goto out; + } pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); /* Set up the argument struct */ - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); - out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_redirty_request(req); - } - return -ENOMEM; + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); +out: + put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + return ret; } static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -996,7 +1004,7 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; - pgio->pg_test = NULL; + pnfs_pageio_init_write(pgio, inode); if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); -- cgit v1.2.2 From 0382b74409c6b9ef12c952b50bb44f557a361a43 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 3 Mar 2011 15:13:45 +0000 Subject: NFSv4.1: implement generic pnfs layer write switch Signed-off-by: Andy Adamson Signed-off-by: Boaz Harrosh Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: J. Bruce Fields Signed-off-by: Mike Sager Signed-off-by: Ricardo Labiaga Signed-off-by: Tao Guo Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 49c4784c24e5..df99c5b0ee65 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -873,6 +873,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); + if (data->lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return 0; + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } -- cgit v1.2.2 From a69aef1496726ed88386dad65abfcc8cd3195304 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:47 +0000 Subject: NFSv4.1: pnfs filelayout driver write Allows the pnfs filelayout driver to write to the data servers. Note that COMMIT to data servers will be implemented in a future patch. To avoid improper behavior, for the moment any WRITE to a data server that would also require a COMMIT to the data server is sent NFS_FILE_SYNC. Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Mingyang Guo Signed-off-by: Oleg Drokin Signed-off-by: Ricardo Labiaga Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index df99c5b0ee65..ee62ddf60e7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -783,7 +783,7 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -static int nfs_initiate_write(struct nfs_write_data *data, +int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how) @@ -833,6 +833,7 @@ static int nfs_initiate_write(struct nfs_write_data *data, out: return ret; } +EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. @@ -1194,6 +1195,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ static unsigned long complain; + /* Note this will print the MDS for a DS write */ if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", @@ -1214,6 +1216,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Was this an NFSv2 write or an NFSv3 stable write? */ if (resp->verf->committed != NFS_UNSTABLE) { /* Resend from where the server left off */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; -- cgit v1.2.2 From c76069bda0f17cd3e153e54d9ac01242909c6b15 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:48 +0000 Subject: NFSv4.1: rearrange ->doio args This will make it possible to clear the lseg pointer in the same function as it is put, instead of in the caller nfs_pageio_doio(). Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ee62ddf60e7a..b74200a2f753 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -898,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(inode)->wsize, nbytes; + size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -924,11 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned } while (nbytes != 0); atomic_set(&req->wb_complete, requests); - BUG_ON(lseg); - lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); + BUG_ON(desc->pg_lseg); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -940,7 +941,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, lseg, how); + wsize, offset, lseg, desc->pg_ioflags); if (ret == 0) ret = ret2; offset += wsize; @@ -968,14 +969,17 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret; - data = nfs_writedata_alloc(npages); + data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); if (!data) { while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -995,10 +999,10 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); /* Set up the argument struct */ - ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: put_lseg(lseg); /* Cleans any gotten in ->pg_test */ return ret; -- cgit v1.2.2 From 36fe432d33e078caee5c954e15e929819c2cacae Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:49 +0000 Subject: NFSv4.1: Clear lseg pointer in ->doio function Now that we have access to the pointer, clear it immediately after the put, instead of in caller. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b74200a2f753..47a3ad63e0d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -949,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) } while (nbytes != 0); put_lseg(lseg); + desc->pg_lseg = NULL; return ret; out_bad: @@ -1005,6 +1006,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + desc->pg_lseg = NULL; return ret; } -- cgit v1.2.2 From b31268ac793fd300da66b9c28bbf0a200339ab96 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2011 17:02:00 -0400 Subject: FS: Use stable writes when not doing a bulk flush If we're only doing a single write, and there are no other unstable writes being queued up, we might want to just flip to using a stable write RPC call. Reviewed-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 47a3ad63e0d5..4d686ee53244 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -179,8 +179,8 @@ static int wb_priority(struct writeback_control *wbc) if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI; - return 0; + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; } /* @@ -863,7 +863,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & FLUSH_STABLE) { + if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { data->args.stable = NFS_DATA_SYNC; if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; @@ -912,6 +912,12 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + desc->pg_count > wsize)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -1002,6 +1008,10 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) if ((!lseg) && list_is_singular(&data->pages)) lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + /* Set up the argument struct */ ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: -- cgit v1.2.2 From b8413f98f997bb3ed7327e6d7117e7e91ce010c3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2011 15:37:01 -0400 Subject: NFS: Fix a hang/infinite loop in nfs_wb_page() When one of the two waits in nfs_commit_inode() is interrupted, it returns a non-negative value, which causes nfs_wb_page() to think that the operation was successful causing it to busy-loop rather than exiting. It also causes nfs_file_fsync() to incorrectly report the file as being successfully committed to disk. This patch fixes both problems by ensuring that we return an error if the attempts to wait fail. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/write.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4d686ee53244..55a8c3671233 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1261,13 +1261,17 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { + int ret; + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) return 1; - if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, nfs_wait_bit_killable, - TASK_KILLABLE)) - return 1; - return 0; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; } static void nfs_commit_clear_lock(struct nfs_inode *nfsi) @@ -1443,9 +1447,10 @@ int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; - int res = 0; + int res; - if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); @@ -1454,12 +1459,14 @@ int nfs_commit_inode(struct inode *inode, int how) int error = nfs_commit_list(inode, &head, how); if (error < 0) return error; - if (may_wait) - wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - else + if (!may_wait) goto out_mark_dirty; + error = wait_on_bit(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; } else nfs_commit_clear_lock(NFS_I(inode)); return res; -- cgit v1.2.2 From 465d52437d5ce8d4eb9da0d3e3818b51cff163a1 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:44 +0000 Subject: NFSv4.1: don't send COMMIT to ds for data sync writes Based on consensus reached in Feb 2011 interim IETF meeting regarding use of LAYOUTCOMMIT, it has been decided that a NFS_DATA_SYNC return from a WRITE to data server should not initiate a COMMIT. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 55a8c3671233..92b4a6614fd3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -474,7 +474,10 @@ nfs_clear_request_commit(struct nfs_page *req) static inline int nfs_write_need_commit(struct nfs_write_data *data) { - return data->verf.committed != NFS_FILE_SYNC; + if (data->verf.committed == NFS_DATA_SYNC) + return data->lseg == NULL; + else + return data->verf.committed != NFS_FILE_SYNC; } static inline -- cgit v1.2.2 From 9ace33cdc6562699a780b4716d9d52c55dd2151a Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:45 +0000 Subject: NFSv4.1: rearrange nfs_commit_rpcsetup Reorder nfs_commit_rpcsetup, preparing for a pnfs entry point. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 60 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 92b4a6614fd3..bf672faa4885 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1293,32 +1293,49 @@ static void nfs_commitdata_release(void *data) nfs_commit_free(wdata); } -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_commit_rpcsetup(struct list_head *head, - struct nfs_write_data *data, - int how) +static int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->path.dentry->d_inode; - int priority = flush_task_priority(how); struct rpc_task *task; + int priority = flush_task_priority(how); struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = first->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, - .callback_ops = &nfs_commit_ops, + .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, .priority = priority, }; + /* Set up the initial task struct. */ + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +static void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->path.dentry->d_inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1326,7 +1343,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, list_splice_init(head, &data->pages); data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = first->wb_context->cred; + data->mds_ops = &nfs_commit_ops; data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ @@ -1337,19 +1355,6 @@ static int nfs_commit_rpcsetup(struct list_head *head, data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - - /* Set up the initial task struct. */ - NFS_PROTO(inode)->commit_setup(data, &msg); - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - if (how & FLUSH_SYNC) - rpc_wait_for_completion_task(task); - rpc_put_task(task); - return 0; } /* @@ -1367,7 +1372,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) goto out_bad; /* Set up the argument struct */ - return nfs_commit_rpcsetup(head, data, how); + nfs_init_commit(data, head); + return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); -- cgit v1.2.2 From 64bfeb49bd1c2351a8415f7fe6b25eef208872a5 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:47 +0000 Subject: NFSv4.1: pull error handling out of nfs_commit_list Create a separate support function for later use by data server commit code. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bf672faa4885..dbc801810e75 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1357,6 +1357,21 @@ static void nfs_init_commit(struct nfs_write_data *data, nfs_fattr_init(&data->fattr); } +static void nfs_retry_commit(struct list_head *page_list) +{ + struct nfs_page *req; + + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + nfs_clear_page_tag_locked(req); + } +} + /* * Commit dirty pages */ @@ -1364,7 +1379,6 @@ static int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { struct nfs_write_data *data; - struct nfs_page *req; data = nfs_commitdata_alloc(); @@ -1375,15 +1389,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_init_commit(data, head); return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_mark_request_commit(req); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_clear_page_tag_locked(req); - } + nfs_retry_commit(head); nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } -- cgit v1.2.2 From 5917ce8440ba0b3f2adee613b5f1258ac5efff02 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:48 +0000 Subject: NFSv4.1: pull out code from nfs_commit_release Create a separate support function for later use by data server commit code. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dbc801810e75..f5f005e9db48 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1409,10 +1409,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) return; } -static void nfs_commit_release(void *calldata) +static void nfs_commit_release_pages(struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; - struct nfs_page *req; + struct nfs_page *req; int status = data->task.tk_status; while (!list_empty(&data->pages)) { @@ -1446,6 +1445,13 @@ static void nfs_commit_release(void *calldata) next: nfs_clear_page_tag_locked(req); } +} + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_commit_release_pages(data); nfs_commit_clear_lock(NFS_I(data->inode)); nfs_commitdata_release(calldata); } -- cgit v1.2.2 From a861a1e1c398fe34701569fd8ac9225dfe0a9a7e Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:51 +0000 Subject: NFSv4.1: add generic layer hooks for pnfs COMMIT We create three major hooks for the pnfs code. pnfs_mark_request_commit() is called during writeback_done from nfs_mark_request_commit, which gives the driver an opportunity to claim it wants control over commiting a particular req. pnfs_choose_commit_list() is called from nfs_scan_list to choose which list a given req should be added to, based on where we intend to send it for COMMIT. It is up to the driver to have preallocated list headers for each destination it may need. pnfs_commit_list() is how the driver actually takes control, it is used instead of nfs_commit_list(). In order to pass information between the above functions, we create a union in nfs_page to hold a lseg (which is possible because the req is not on any list while in transition), and add some flags to indicate if we need to use the pnfs code. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f5f005e9db48..6927a18b6891 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -441,7 +441,7 @@ nfs_mark_request_dirty(struct nfs_page *req) * Add a request to the inode's commit list. */ static void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -453,6 +453,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); nfsi->ncommit++; spin_unlock(&inode->i_lock); + pnfs_mark_request_commit(req, lseg); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -481,10 +482,11 @@ int nfs_write_need_commit(struct nfs_write_data *data) } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); return 1; } if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { @@ -495,7 +497,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) } #else static inline void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { } @@ -512,7 +514,8 @@ int nfs_write_need_commit(struct nfs_write_data *data) } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { return 0; } @@ -615,9 +618,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, } if (nfs_clear_request_commit(req) && - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { NFS_I(inode)->ncommit--; + pnfs_clear_request_commit(req); + } /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { @@ -765,11 +770,12 @@ int nfs_updatepage(struct file *file, struct page *page, return status; } -static void nfs_writepage_release(struct nfs_page *req) +static void nfs_writepage_release(struct nfs_page *req, + struct nfs_write_data *data) { struct page *page = req->wb_page; - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) nfs_inode_remove_request(req); nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); @@ -1087,7 +1093,7 @@ static void nfs_writeback_release_partial(void *calldata) out: if (atomic_dec_and_test(&req->wb_complete)) - nfs_writepage_release(req); + nfs_writepage_release(req, data); nfs_writedata_release(calldata); } @@ -1154,7 +1160,7 @@ static void nfs_writeback_release_full(void *calldata) if (nfs_write_need_commit(data)) { memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); dprintk(" marked for commit\n"); goto next; } @@ -1357,14 +1363,15 @@ static void nfs_init_commit(struct nfs_write_data *data, nfs_fattr_init(&data->fattr); } -static void nfs_retry_commit(struct list_head *page_list) +static void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg) { struct nfs_page *req; while (!list_empty(page_list)) { req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, lseg); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); @@ -1389,7 +1396,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_init_commit(data, head); return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: - nfs_retry_commit(head); + nfs_retry_commit(head, NULL); nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1477,7 +1484,11 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error; + + error = pnfs_commit_list(inode, &head, how); + if (error == PNFS_NOT_ATTEMPTED) + error = nfs_commit_list(inode, &head, how); if (error < 0) return error; if (!may_wait) -- cgit v1.2.2 From 988b6dceb0ae6d642587c8594529b94f6be0c5ea Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:52 +0000 Subject: NFSv4.1: remove GETATTR from ds commits Any COMMIT compound directed to a data server needs to have the GETATTR calls suppressed. We here, make sure the field we are testing (data->lseg) is set and refcounted correctly. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6927a18b6891..cae5d160d835 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1295,6 +1295,7 @@ static void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } @@ -1338,7 +1339,8 @@ static int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *cln * Set up the argument/result storage required for the RPC call. */ static void nfs_init_commit(struct nfs_write_data *data, - struct list_head *head) + struct list_head *head, + struct pnfs_layout_segment *lseg) { struct nfs_page *first = nfs_list_entry(head->next); struct inode *inode = first->wb_context->path.dentry->d_inode; @@ -1350,6 +1352,7 @@ static void nfs_init_commit(struct nfs_write_data *data, data->inode = inode; data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ data->mds_ops = &nfs_commit_ops; data->args.fh = NFS_FH(data->inode); @@ -1393,7 +1396,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) goto out_bad; /* Set up the argument struct */ - nfs_init_commit(data, head); + nfs_init_commit(data, head, NULL); return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: nfs_retry_commit(head, NULL); -- cgit v1.2.2 From e0c2b3801828aadb65dec9f67f7c6b7a675ad007 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:53 +0000 Subject: NFSv4.1: filelayout driver specific code for COMMIT Implement all the hooks created in the previous patches. This requires exporting quite a few functions and adding a few structure fields. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cae5d160d835..e7aeda0663c5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void) } return p; } +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); void nfs_commit_free(struct nfs_write_data *p) { @@ -66,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } +EXPORT_SYMBOL_GPL(nfs_commit_free); struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { @@ -1283,15 +1285,15 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) return (ret < 0) ? ret : 1; } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); smp_mb__after_clear_bit(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } +EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); - -static void nfs_commitdata_release(void *data) +void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1299,8 +1301,9 @@ static void nfs_commitdata_release(void *data) put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } +EXPORT_SYMBOL_GPL(nfs_commitdata_release); -static int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, +int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how) { @@ -1334,11 +1337,12 @@ static int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *cln rpc_put_task(task); return 0; } +EXPORT_SYMBOL_GPL(nfs_initiate_commit); /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_init_commit(struct nfs_write_data *data, +void nfs_init_commit(struct nfs_write_data *data, struct list_head *head, struct pnfs_layout_segment *lseg) { @@ -1365,8 +1369,9 @@ static void nfs_init_commit(struct nfs_write_data *data, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); } +EXPORT_SYMBOL_GPL(nfs_init_commit); -static void nfs_retry_commit(struct list_head *page_list, +void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg) { struct nfs_page *req; @@ -1381,6 +1386,7 @@ static void nfs_retry_commit(struct list_head *page_list, nfs_clear_page_tag_locked(req); } } +EXPORT_SYMBOL_GPL(nfs_retry_commit); /* * Commit dirty pages @@ -1419,7 +1425,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) return; } -static void nfs_commit_release_pages(struct nfs_write_data *data) +void nfs_commit_release_pages(struct nfs_write_data *data) { struct nfs_page *req; int status = data->task.tk_status; @@ -1456,6 +1462,7 @@ static void nfs_commit_release_pages(struct nfs_write_data *data) nfs_clear_page_tag_locked(req); } } +EXPORT_SYMBOL_GPL(nfs_commit_release_pages); static void nfs_commit_release(void *calldata) { -- cgit v1.2.2 From 863a3c6c686d5773f7192a4818769e15db12ce08 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 23 Mar 2011 13:27:54 +0000 Subject: NFSv4.1: layoutcommit The filelayout driver sends LAYOUTCOMMIT only when COMMIT goes to the data server (as opposed to the MDS) and the data server WRITE is not NFS_FILE_SYNC. Only whole file layout support means that there is only one IOMODE_RW layout segment. Signed-off-by: Andy Adamson Signed-off-by: Alexandros Batsakis Signed-off-by: Boaz Harrosh Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Mingyang Guo Signed-off-by: Tao Guo Signed-off-by: Zhang Jingwang Tested-by: Boaz Harrosh Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e7aeda0663c5..a03c11f9081e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1562,7 +1562,20 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return nfs_commit_unstable_pages(inode, wbc); + int ret; + + ret = nfs_commit_unstable_pages(inode, wbc); + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status, sync = wbc->sync_mode; + + if (wbc->nonblocking || wbc->for_background) + sync = 0; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; } /* -- cgit v1.2.2 From ef31153786bc1e4304e6b9422cc8b9efef455611 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Sat, 12 Mar 2011 02:58:10 -0500 Subject: NFSv4.1 convert layoutcommit sync to boolean Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a03c11f9081e..85d75254328e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1566,10 +1566,12 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) ret = nfs_commit_unstable_pages(inode, wbc); if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { - int status, sync = wbc->sync_mode; + int status; + bool sync = true; - if (wbc->nonblocking || wbc->for_background) - sync = 0; + if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || + wbc->for_background) + sync = false; status = pnfs_layoutcommit_inode(inode, sync); if (status < 0) -- cgit v1.2.2 From 4d65c520fb4abed970069d18c119cfe85624f46d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Mar 2011 14:15:11 -0400 Subject: NFS: Fix a hang in the writeback path Now that the inode scalability patches have been merged, it is no longer safe to call igrab() under the inode->i_lock. Now that we no longer call nfs_clear_request() until the nfs_page is being freed, we know that we are always holding a reference to the nfs_open_context, which again holds a reference to the path, and so the inode cannot be freed until the last nfs_page has been removed from the radix tree and freed. We can therefore skip the igrab()/iput() altogether. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 85d75254328e..af0c6279a4a7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -389,11 +389,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) spin_lock(&inode->i_lock); error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); - if (!nfsi->npages) { - igrab(inode); - if (nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; - } + if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) + nfsi->change_attr++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); @@ -423,11 +420,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) clear_bit(PG_MAPPED, &req->wb_flags); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; - if (!nfsi->npages) { - spin_unlock(&inode->i_lock); - iput(inode); - } else - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); nfs_release_request(req); } -- cgit v1.2.2 From 0d88f6e804c824454b5ed0d3034ed3dcf7467a87 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Apr 2011 19:18:08 +1000 Subject: nfs: don't call __mark_inode_dirty while holding i_lock nfs_scan_commit() is called with the inode->i_lock held, but it then calls __mark_inode_dirty() while still holding the lock. This causes a deadlock. Push the inode->i_lock into nfs_scan_commit() so it can protect only the parts of the code it needs to and can be dropped before the call to __mark_inode_dirty() to avoid the deadlock. Signed-off-by: Dave Chinner Tested-by: Will Simoneau Signed-off-by: Linus Torvalds --- fs/nfs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af0c6279a4a7..e4cbc11a74ab 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -542,11 +542,15 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u if (!nfs_need_commit(nfsi)) return 0; + spin_lock(&inode->i_lock); ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); if (ret > 0) nfsi->ncommit -= ret; + spin_unlock(&inode->i_lock); + if (nfs_need_commit(NFS_I(inode))) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; } #else @@ -1483,9 +1487,7 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_commit_set_lock(NFS_I(inode), may_wait); if (res <= 0) goto out_mark_dirty; - spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&inode->i_lock); if (res) { int error; -- cgit v1.2.2 From 4b38a6db01b09198f4045661815a0039c3d80660 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 11 Apr 2011 11:56:23 -0400 Subject: NFS: Eliminate duplicate call to nfs_mark_request_dirty We only need to call nfs_mark_request_dirty() once in nfs_writepage_setup(). Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e4cbc11a74ab..bb608186b099 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -680,7 +680,6 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, req = nfs_setup_write_request(ctx, page, offset, count); if (IS_ERR(req)) return PTR_ERR(req); - nfs_mark_request_dirty(req); /* Update file length */ nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); -- cgit v1.2.2 From c0d0e96b840dcc73f9b9d45bf8f405dbce72a079 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Apr 2011 12:29:15 -0400 Subject: NFS: Get rid of pointless test in nfs_commit_done Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb608186b099..3bd5d7e80f6c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1417,8 +1417,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) task->tk_pid, task->tk_status); /* Call the NFS version-specific code */ - if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) - return; + NFS_PROTO(data->inode)->commit_done(task, data); } void nfs_commit_release_pages(struct nfs_write_data *data) -- cgit v1.2.2 From a75b9df9d3bfc3cd1083974c045ae31ce5f3434f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 May 2011 18:00:51 -0400 Subject: NFSv4.1: Ensure that layoutget uses the correct gfp modes Currently, writebacks may end up recursing back into the filesystem due to GFP_KERNEL direct reclaims in the pnfs subsystem. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3bd5d7e80f6c..49c715b4ac92 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -939,7 +939,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) atomic_set(&req->wb_complete, requests); BUG_ON(desc->pg_lseg); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -1013,7 +1013,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) -- cgit v1.2.2