about | summary | refs | log | tree | commit | diff | stats
path: root/fs/nfs/write.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2014-06-10 18:02:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-06-10 18:02:42 -0400
commit d1e1cda862c16252087374ac75949b0e89a5717e (patch)
tree 544ce467bed23638949a1991b4f7b00e7472baa4 /fs/nfs/write.c
parent 07888238f55056605cd23aa4ea3ca97d5e15938f (diff)
parent a914722f333b3359d2f4f12919380a334176bb89 (diff)
Merge tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

   - massive cleanup of the NFS read/write code by Anna and Dros
   - support multiple NFS read/write requests per page in order to deal
     with non-page aligned pNFS striping. Also cleans up the r/wsize <
     page size code nicely.
   - stable fix for ensuring inode is declared uptodate only after all
     the attributes have been checked.
   - stable fix for a kernel Oops when remounting
   - NFS over RDMA client fixes
   - move the pNFS files layout driver into its own subdirectory"

* tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits)
  NFS: populate ->net in mount data when remounting
  pnfs: fix lockup caused by pnfs_generic_pg_test
  NFSv4.1: Fix typo in dprintk
  NFSv4.1: Comment is now wrong and redundant to code
  NFS: Use raw_write_seqcount_begin/end int nfs4_reclaim_open_state
  xprtrdma: Disconnect on registration failure
  xprtrdma: Remove BUG_ON() call sites
  xprtrdma: Avoid deadlock when credit window is reset
  SUNRPC: Move congestion window constants to header file
  xprtrdma: Reset connection timeout after successful reconnect
  xprtrdma: Use macros for reconnection timeout constants
  xprtrdma: Allocate missing pagelist
  xprtrdma: Remove Tavor MTU setting
  xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
  xprtrdma: Reduce the number of hardway buffer allocations
  xprtrdma: Limit work done by completion handler
  xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
  xprtrmda: Reduce lock contention in completion handlers
  xprtrdma: Split the completion queue
  xprtrdma: Make rpcrdma_ep_destroy() return void
  ...
Diffstat (limited to 'fs/nfs/write.c')
-rw-r--r-- fs/nfs/write.c 588
1 files changed, 188 insertions, 400 deletions
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ffb9459f180b..3ee5af4e738e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -42,10 +42,10 @@
42 * Local function declarations 42 * Local function declarations
43 */ 43 */
44static void nfs_redirty_request(struct nfs_page *req); 44static void nfs_redirty_request(struct nfs_page *req);
45static const struct rpc_call_ops nfs_write_common_ops;
46static const struct rpc_call_ops nfs_commit_ops; 45static const struct rpc_call_ops nfs_commit_ops;
47static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; 46static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
48static const struct nfs_commit_completion_ops nfs_commit_completion_ops; 47static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
48static const struct nfs_rw_ops nfs_rw_write_ops;
49 49
50static struct kmem_cache *nfs_wdata_cachep; 50static struct kmem_cache *nfs_wdata_cachep;
51static mempool_t *nfs_wdata_mempool; 51static mempool_t *nfs_wdata_mempool;
@@ -70,76 +70,19 @@ void nfs_commit_free(struct nfs_commit_data *p)
70} 70}
71EXPORT_SYMBOL_GPL(nfs_commit_free); 71EXPORT_SYMBOL_GPL(nfs_commit_free);
72 72
73struct nfs_write_header *nfs_writehdr_alloc(void) 73static struct nfs_rw_header *nfs_writehdr_alloc(void)
74{ 74{
75 struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); 75 struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
76
77 if (p) {
78 struct nfs_pgio_header *hdr = &p->header;
79 76
77 if (p)
80 memset(p, 0, sizeof(*p)); 78 memset(p, 0, sizeof(*p));
81 INIT_LIST_HEAD(&hdr->pages);
82 INIT_LIST_HEAD(&hdr->rpc_list);
83 spin_lock_init(&hdr->lock);
84 atomic_set(&hdr->refcnt, 0);
85 hdr->verf = &p->verf;
86 }
87 return p; 79 return p;
88} 80}
89EXPORT_SYMBOL_GPL(nfs_writehdr_alloc);
90
91static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
92 unsigned int pagecount)
93{
94 struct nfs_write_data *data, *prealloc;
95
96 prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data;
97 if (prealloc->header == NULL)
98 data = prealloc;
99 else
100 data = kzalloc(sizeof(*data), GFP_KERNEL);
101 if (!data)
102 goto out;
103
104 if (nfs_pgarray_set(&data->pages, pagecount)) {
105 data->header = hdr;
106 atomic_inc(&hdr->refcnt);
107 } else {
108 if (data != prealloc)
109 kfree(data);
110 data = NULL;
111 }
112out:
113 return data;
114}
115 81
116void nfs_writehdr_free(struct nfs_pgio_header *hdr) 82static void nfs_writehdr_free(struct nfs_rw_header *whdr)
117{ 83{
118 struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
119 mempool_free(whdr, nfs_wdata_mempool); 84 mempool_free(whdr, nfs_wdata_mempool);
120} 85}
121EXPORT_SYMBOL_GPL(nfs_writehdr_free);
122
123void nfs_writedata_release(struct nfs_write_data *wdata)
124{
125 struct nfs_pgio_header *hdr = wdata->header;
126 struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
127
128 put_nfs_open_context(wdata->args.context);
129 if (wdata->pages.pagevec != wdata->pages.page_array)
130 kfree(wdata->pages.pagevec);
131 if (wdata == &write_header->rpc_data) {
132 wdata->header = NULL;
133 wdata = NULL;
134 }
135 if (atomic_dec_and_test(&hdr->refcnt))
136 hdr->completion_ops->completion(hdr);
137 /* Note: we only free the rpc_task after callbacks are done.
138 * See the comment in rpc_free_task() for why
139 */
140 kfree(wdata);
141}
142EXPORT_SYMBOL_GPL(nfs_writedata_release);
143 86
144static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) 87static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
145{ 88{
@@ -211,18 +154,78 @@ static void nfs_set_pageerror(struct page *page)
211 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); 154 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
212} 155}
213 156
157/*
158 * nfs_page_group_search_locked
159 * @head - head request of page group
160 * @page_offset - offset into page
161 *
162 * Search page group with head @head to find a request that contains the
163 * page offset @page_offset.
164 *
165 * Returns a pointer to the first matching nfs request, or NULL if no
166 * match is found.
167 *
168 * Must be called with the page group lock held
169 */
170static struct nfs_page *
171nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
172{
173 struct nfs_page *req;
174
175 WARN_ON_ONCE(head != head->wb_head);
176 WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
177
178 req = head;
179 do {
180 if (page_offset >= req->wb_pgbase &&
181 page_offset < (req->wb_pgbase + req->wb_bytes))
182 return req;
183
184 req = req->wb_this_page;
185 } while (req != head);
186
187 return NULL;
188}
189
190/*
191 * nfs_page_group_covers_page
192 * @head - head request of page group
193 *
194 * Return true if the page group with head @head covers the whole page,
195 * returns false otherwise
196 */
197static bool nfs_page_group_covers_page(struct nfs_page *req)
198{
199 struct nfs_page *tmp;
200 unsigned int pos = 0;
201 unsigned int len = nfs_page_length(req->wb_page);
202
203 nfs_page_group_lock(req);
204
205 do {
206 tmp = nfs_page_group_search_locked(req->wb_head, pos);
207 if (tmp) {
208 /* no way this should happen */
209 WARN_ON_ONCE(tmp->wb_pgbase != pos);
210 pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
211 }
212 } while (tmp && pos < len);
213
214 nfs_page_group_unlock(req);
215 WARN_ON_ONCE(pos > len);
216 return pos == len;
217}
218
214/* We can set the PG_uptodate flag if we see that a write request 219/* We can set the PG_uptodate flag if we see that a write request
215 * covers the full page. 220 * covers the full page.
216 */ 221 */
217static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) 222static void nfs_mark_uptodate(struct nfs_page *req)
218{ 223{
219 if (PageUptodate(page)) 224 if (PageUptodate(req->wb_page))
220 return;
221 if (base != 0)
222 return; 225 return;
223 if (count != nfs_page_length(page)) 226 if (!nfs_page_group_covers_page(req))
224 return; 227 return;
225 SetPageUptodate(page); 228 SetPageUptodate(req->wb_page);
226} 229}
227 230
228static int wb_priority(struct writeback_control *wbc) 231static int wb_priority(struct writeback_control *wbc)
@@ -258,12 +261,15 @@ static void nfs_set_page_writeback(struct page *page)
258 } 261 }
259} 262}
260 263
261static void nfs_end_page_writeback(struct page *page) 264static void nfs_end_page_writeback(struct nfs_page *req)
262{ 265{
263 struct inode *inode = page_file_mapping(page)->host; 266 struct inode *inode = page_file_mapping(req->wb_page)->host;
264 struct nfs_server *nfss = NFS_SERVER(inode); 267 struct nfs_server *nfss = NFS_SERVER(inode);
265 268
266 end_page_writeback(page); 269 if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
270 return;
271
272 end_page_writeback(req->wb_page);
267 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 273 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
268 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 274 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
269} 275}
@@ -354,10 +360,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
354 struct nfs_pageio_descriptor pgio; 360 struct nfs_pageio_descriptor pgio;
355 int err; 361 int err;
356 362
357 NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, 363 nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
358 page->mapping->host, 364 false, &nfs_async_write_completion_ops);
359 wb_priority(wbc),
360 &nfs_async_write_completion_ops);
361 err = nfs_do_writepage(page, wbc, &pgio); 365 err = nfs_do_writepage(page, wbc, &pgio);
362 nfs_pageio_complete(&pgio); 366 nfs_pageio_complete(&pgio);
363 if (err < 0) 367 if (err < 0)
@@ -400,7 +404,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
400 404
401 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 405 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
402 406
403 NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); 407 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
408 &nfs_async_write_completion_ops);
404 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 409 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
405 nfs_pageio_complete(&pgio); 410 nfs_pageio_complete(&pgio);
406 411
@@ -425,6 +430,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
425{ 430{
426 struct nfs_inode *nfsi = NFS_I(inode); 431 struct nfs_inode *nfsi = NFS_I(inode);
427 432
433 WARN_ON_ONCE(req->wb_this_page != req);
434
428 /* Lock the request! */ 435 /* Lock the request! */
429 nfs_lock_request(req); 436 nfs_lock_request(req);
430 437
@@ -441,6 +448,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
441 set_page_private(req->wb_page, (unsigned long)req); 448 set_page_private(req->wb_page, (unsigned long)req);
442 } 449 }
443 nfsi->npages++; 450 nfsi->npages++;
451 set_bit(PG_INODE_REF, &req->wb_flags);
444 kref_get(&req->wb_kref); 452 kref_get(&req->wb_kref);
445 spin_unlock(&inode->i_lock); 453 spin_unlock(&inode->i_lock);
446} 454}
@@ -452,15 +460,20 @@ static void nfs_inode_remove_request(struct nfs_page *req)
452{ 460{
453 struct inode *inode = req->wb_context->dentry->d_inode; 461 struct inode *inode = req->wb_context->dentry->d_inode;
454 struct nfs_inode *nfsi = NFS_I(inode); 462 struct nfs_inode *nfsi = NFS_I(inode);
463 struct nfs_page *head;
455 464
456 spin_lock(&inode->i_lock); 465 if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
457 if (likely(!PageSwapCache(req->wb_page))) { 466 head = req->wb_head;
458 set_page_private(req->wb_page, 0); 467
459 ClearPagePrivate(req->wb_page); 468 spin_lock(&inode->i_lock);
460 clear_bit(PG_MAPPED, &req->wb_flags); 469 if (likely(!PageSwapCache(head->wb_page))) {
470 set_page_private(head->wb_page, 0);
471 ClearPagePrivate(head->wb_page);
472 clear_bit(PG_MAPPED, &head->wb_flags);
473 }
474 nfsi->npages--;
475 spin_unlock(&inode->i_lock);
461 } 476 }
462 nfsi->npages--;
463 spin_unlock(&inode->i_lock);
464 nfs_release_request(req); 477 nfs_release_request(req);
465} 478}
466 479
@@ -583,7 +596,7 @@ nfs_clear_request_commit(struct nfs_page *req)
583} 596}
584 597
585static inline 598static inline
586int nfs_write_need_commit(struct nfs_write_data *data) 599int nfs_write_need_commit(struct nfs_pgio_data *data)
587{ 600{
588 if (data->verf.committed == NFS_DATA_SYNC) 601 if (data->verf.committed == NFS_DATA_SYNC)
589 return data->header->lseg == NULL; 602 return data->header->lseg == NULL;
@@ -614,7 +627,7 @@ nfs_clear_request_commit(struct nfs_page *req)
614} 627}
615 628
616static inline 629static inline
617int nfs_write_need_commit(struct nfs_write_data *data) 630int nfs_write_need_commit(struct nfs_pgio_data *data)
618{ 631{
619 return 0; 632 return 0;
620} 633}
@@ -625,6 +638,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
625{ 638{
626 struct nfs_commit_info cinfo; 639 struct nfs_commit_info cinfo;
627 unsigned long bytes = 0; 640 unsigned long bytes = 0;
641 bool do_destroy;
628 642
629 if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) 643 if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
630 goto out; 644 goto out;
@@ -645,7 +659,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
645 goto next; 659 goto next;
646 } 660 }
647 if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { 661 if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
648 memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); 662 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
649 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 663 nfs_mark_request_commit(req, hdr->lseg, &cinfo);
650 goto next; 664 goto next;
651 } 665 }
@@ -653,7 +667,8 @@ remove_req:
653 nfs_inode_remove_request(req); 667 nfs_inode_remove_request(req);
654next: 668next:
655 nfs_unlock_request(req); 669 nfs_unlock_request(req);
656 nfs_end_page_writeback(req->wb_page); 670 nfs_end_page_writeback(req);
671 do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
657 nfs_release_request(req); 672 nfs_release_request(req);
658 } 673 }
659out: 674out:
@@ -661,7 +676,7 @@ out:
661} 676}
662 677
663#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 678#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
664static unsigned long 679unsigned long
665nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 680nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
666{ 681{
667 return cinfo->mds->ncommit; 682 return cinfo->mds->ncommit;
@@ -718,7 +733,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
718} 733}
719 734
720#else 735#else
721static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 736unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
722{ 737{
723 return 0; 738 return 0;
724} 739}
@@ -758,6 +773,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
758 if (req == NULL) 773 if (req == NULL)
759 goto out_unlock; 774 goto out_unlock;
760 775
776 /* should be handled by nfs_flush_incompatible */
777 WARN_ON_ONCE(req->wb_head != req);
778 WARN_ON_ONCE(req->wb_this_page != req);
779
761 rqend = req->wb_offset + req->wb_bytes; 780 rqend = req->wb_offset + req->wb_bytes;
762 /* 781 /*
763 * Tell the caller to flush out the request if 782 * Tell the caller to flush out the request if
@@ -819,7 +838,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
819 req = nfs_try_to_update_request(inode, page, offset, bytes); 838 req = nfs_try_to_update_request(inode, page, offset, bytes);
820 if (req != NULL) 839 if (req != NULL)
821 goto out; 840 goto out;
822 req = nfs_create_request(ctx, inode, page, offset, bytes); 841 req = nfs_create_request(ctx, page, NULL, offset, bytes);
823 if (IS_ERR(req)) 842 if (IS_ERR(req))
824 goto out; 843 goto out;
825 nfs_inode_add_request(inode, req); 844 nfs_inode_add_request(inode, req);
@@ -837,7 +856,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
837 return PTR_ERR(req); 856 return PTR_ERR(req);
838 /* Update file length */ 857 /* Update file length */
839 nfs_grow_file(page, offset, count); 858 nfs_grow_file(page, offset, count);
840 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); 859 nfs_mark_uptodate(req);
841 nfs_mark_request_dirty(req); 860 nfs_mark_request_dirty(req);
842 nfs_unlock_and_release_request(req); 861 nfs_unlock_and_release_request(req);
843 return 0; 862 return 0;
@@ -863,6 +882,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
863 return 0; 882 return 0;
864 l_ctx = req->wb_lock_context; 883 l_ctx = req->wb_lock_context;
865 do_flush = req->wb_page != page || req->wb_context != ctx; 884 do_flush = req->wb_page != page || req->wb_context != ctx;
885 /* for now, flush if more than 1 request in page_group */
886 do_flush |= req->wb_this_page != req;
866 if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { 887 if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
867 do_flush |= l_ctx->lockowner.l_owner != current->files 888 do_flush |= l_ctx->lockowner.l_owner != current->files
868 || l_ctx->lockowner.l_pid != current->tgid; 889 || l_ctx->lockowner.l_pid != current->tgid;
@@ -990,126 +1011,17 @@ static int flush_task_priority(int how)
990 return RPC_PRIORITY_NORMAL; 1011 return RPC_PRIORITY_NORMAL;
991} 1012}
992 1013
993int nfs_initiate_write(struct rpc_clnt *clnt, 1014static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
994 struct nfs_write_data *data, 1015 struct rpc_task_setup *task_setup_data, int how)
995 const struct rpc_call_ops *call_ops,
996 int how, int flags)
997{ 1016{
998 struct inode *inode = data->header->inode; 1017 struct inode *inode = data->header->inode;
999 int priority = flush_task_priority(how); 1018 int priority = flush_task_priority(how);
1000 struct rpc_task *task;
1001 struct rpc_message msg = {
1002 .rpc_argp = &data->args,
1003 .rpc_resp = &data->res,
1004 .rpc_cred = data->header->cred,
1005 };
1006 struct rpc_task_setup task_setup_data = {
1007 .rpc_client = clnt,
1008 .task = &data->task,
1009 .rpc_message = &msg,
1010 .callback_ops = call_ops,
1011 .callback_data = data,
1012 .workqueue = nfsiod_workqueue,
1013 .flags = RPC_TASK_ASYNC | flags,
1014 .priority = priority,
1015 };
1016 int ret = 0;
1017
1018 /* Set up the initial task struct. */
1019 NFS_PROTO(inode)->write_setup(data, &msg);
1020 1019
1021 dprintk("NFS: %5u initiated write call " 1020 task_setup_data->priority = priority;
1022 "(req %s/%llu, %u bytes @ offset %llu)\n", 1021 NFS_PROTO(inode)->write_setup(data, msg);
1023 data->task.tk_pid,
1024 inode->i_sb->s_id,
1025 (unsigned long long)NFS_FILEID(inode),
1026 data->args.count,
1027 (unsigned long long)data->args.offset);
1028 1022
1029 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, 1023 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
1030 &task_setup_data.rpc_client, &msg, data); 1024 &task_setup_data->rpc_client, msg, data);
1031
1032 task = rpc_run_task(&task_setup_data);
1033 if (IS_ERR(task)) {
1034 ret = PTR_ERR(task);
1035 goto out;
1036 }
1037 if (how & FLUSH_SYNC) {
1038 ret = rpc_wait_for_completion_task(task);
1039 if (ret == 0)
1040 ret = task->tk_status;
1041 }
1042 rpc_put_task(task);
1043out:
1044 return ret;
1045}
1046EXPORT_SYMBOL_GPL(nfs_initiate_write);
1047
1048/*
1049 * Set up the argument/result storage required for the RPC call.
1050 */
1051static void nfs_write_rpcsetup(struct nfs_write_data *data,
1052 unsigned int count, unsigned int offset,
1053 int how, struct nfs_commit_info *cinfo)
1054{
1055 struct nfs_page *req = data->header->req;
1056
1057 /* Set up the RPC argument and reply structs
1058 * NB: take care not to mess about with data->commit et al. */
1059
1060 data->args.fh = NFS_FH(data->header->inode);
1061 data->args.offset = req_offset(req) + offset;
1062 /* pnfs_set_layoutcommit needs this */
1063 data->mds_offset = data->args.offset;
1064 data->args.pgbase = req->wb_pgbase + offset;
1065 data->args.pages = data->pages.pagevec;
1066 data->args.count = count;
1067 data->args.context = get_nfs_open_context(req->wb_context);
1068 data->args.lock_context = req->wb_lock_context;
1069 data->args.stable = NFS_UNSTABLE;
1070 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
1071 case 0:
1072 break;
1073 case FLUSH_COND_STABLE:
1074 if (nfs_reqs_to_commit(cinfo))
1075 break;
1076 default:
1077 data->args.stable = NFS_FILE_SYNC;
1078 }
1079
1080 data->res.fattr = &data->fattr;
1081 data->res.count = count;
1082 data->res.verf = &data->verf;
1083 nfs_fattr_init(&data->fattr);
1084}
1085
1086static int nfs_do_write(struct nfs_write_data *data,
1087 const struct rpc_call_ops *call_ops,
1088 int how)
1089{
1090 struct inode *inode = data->header->inode;
1091
1092 return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
1093}
1094
1095static int nfs_do_multiple_writes(struct list_head *head,
1096 const struct rpc_call_ops *call_ops,
1097 int how)
1098{
1099 struct nfs_write_data *data;
1100 int ret = 0;
1101
1102 while (!list_empty(head)) {
1103 int ret2;
1104
1105 data = list_first_entry(head, struct nfs_write_data, list);
1106 list_del_init(&data->list);
1107
1108 ret2 = nfs_do_write(data, call_ops, how);
1109 if (ret == 0)
1110 ret = ret2;
1111 }
1112 return ret;
1113} 1025}
1114 1026
1115/* If a nfs_flush_* function fails, it should remove reqs from @head and 1027/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1120,7 +1032,7 @@ static void nfs_redirty_request(struct nfs_page *req)
1120{ 1032{
1121 nfs_mark_request_dirty(req); 1033 nfs_mark_request_dirty(req);
1122 nfs_unlock_request(req); 1034 nfs_unlock_request(req);
1123 nfs_end_page_writeback(req->wb_page); 1035 nfs_end_page_writeback(req);
1124 nfs_release_request(req); 1036 nfs_release_request(req);
1125} 1037}
1126 1038
@@ -1140,173 +1052,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
1140 .completion = nfs_write_completion, 1052 .completion = nfs_write_completion,
1141}; 1053};
1142 1054
1143static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
1144 struct nfs_pgio_header *hdr)
1145{
1146 set_bit(NFS_IOHDR_REDO, &hdr->flags);
1147 while (!list_empty(&hdr->rpc_list)) {
1148 struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
1149 struct nfs_write_data, list);
1150 list_del(&data->list);
1151 nfs_writedata_release(data);
1152 }
1153 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1154}
1155
1156/*
1157 * Generate multiple small requests to write out a single
1158 * contiguous dirty area on one page.
1159 */
1160static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
1161 struct nfs_pgio_header *hdr)
1162{
1163 struct nfs_page *req = hdr->req;
1164 struct page *page = req->wb_page;
1165 struct nfs_write_data *data;
1166 size_t wsize = desc->pg_bsize, nbytes;
1167 unsigned int offset;
1168 int requests = 0;
1169 struct nfs_commit_info cinfo;
1170
1171 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
1172
1173 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1174 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
1175 desc->pg_count > wsize))
1176 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1177
1178
1179 offset = 0;
1180 nbytes = desc->pg_count;
1181 do {
1182 size_t len = min(nbytes, wsize);
1183
1184 data = nfs_writedata_alloc(hdr, 1);
1185 if (!data) {
1186 nfs_flush_error(desc, hdr);
1187 return -ENOMEM;
1188 }
1189 data->pages.pagevec[0] = page;
1190 nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
1191 list_add(&data->list, &hdr->rpc_list);
1192 requests++;
1193 nbytes -= len;
1194 offset += len;
1195 } while (nbytes != 0);
1196 nfs_list_remove_request(req);
1197 nfs_list_add_request(req, &hdr->pages);
1198 desc->pg_rpc_callops = &nfs_write_common_ops;
1199 return 0;
1200}
1201
1202/*
1203 * Create an RPC task for the given write request and kick it.
1204 * The page must have been locked by the caller.
1205 *
1206 * It may happen that the page we're passed is not marked dirty.
1207 * This is the case if nfs_updatepage detects a conflicting request
1208 * that has been written but not committed.
1209 */
1210static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
1211 struct nfs_pgio_header *hdr)
1212{
1213 struct nfs_page *req;
1214 struct page **pages;
1215 struct nfs_write_data *data;
1216 struct list_head *head = &desc->pg_list;
1217 struct nfs_commit_info cinfo;
1218
1219 data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
1220 desc->pg_count));
1221 if (!data) {
1222 nfs_flush_error(desc, hdr);
1223 return -ENOMEM;
1224 }
1225
1226 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
1227 pages = data->pages.pagevec;
1228 while (!list_empty(head)) {
1229 req = nfs_list_entry(head->next);
1230 nfs_list_remove_request(req);
1231 nfs_list_add_request(req, &hdr->pages);
1232 *pages++ = req->wb_page;
1233 }
1234
1235 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1236 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
1237 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1238
1239 /* Set up the argument struct */
1240 nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
1241 list_add(&data->list, &hdr->rpc_list);
1242 desc->pg_rpc_callops = &nfs_write_common_ops;
1243 return 0;
1244}
1245
1246int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
1247 struct nfs_pgio_header *hdr)
1248{
1249 if (desc->pg_bsize < PAGE_CACHE_SIZE)
1250 return nfs_flush_multi(desc, hdr);
1251 return nfs_flush_one(desc, hdr);
1252}
1253EXPORT_SYMBOL_GPL(nfs_generic_flush);
1254
1255static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1256{
1257 struct nfs_write_header *whdr;
1258 struct nfs_pgio_header *hdr;
1259 int ret;
1260
1261 whdr = nfs_writehdr_alloc();
1262 if (!whdr) {
1263 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1264 return -ENOMEM;
1265 }
1266 hdr = &whdr->header;
1267 nfs_pgheader_init(desc, hdr, nfs_writehdr_free);
1268 atomic_inc(&hdr->refcnt);
1269 ret = nfs_generic_flush(desc, hdr);
1270 if (ret == 0)
1271 ret = nfs_do_multiple_writes(&hdr->rpc_list,
1272 desc->pg_rpc_callops,
1273 desc->pg_ioflags);
1274 if (atomic_dec_and_test(&hdr->refcnt))
1275 hdr->completion_ops->completion(hdr);
1276 return ret;
1277}
1278
1279static const struct nfs_pageio_ops nfs_pageio_write_ops = {
1280 .pg_test = nfs_generic_pg_test,
1281 .pg_doio = nfs_generic_pg_writepages,
1282};
1283
1284void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1055void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1285 struct inode *inode, int ioflags, 1056 struct inode *inode, int ioflags, bool force_mds,
1286 const struct nfs_pgio_completion_ops *compl_ops) 1057 const struct nfs_pgio_completion_ops *compl_ops)
1287{ 1058{
1288 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, 1059 struct nfs_server *server = NFS_SERVER(inode);
1289 NFS_SERVER(inode)->wsize, ioflags); 1060 const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
1061
1062#ifdef CONFIG_NFS_V4_1
1063 if (server->pnfs_curr_ld && !force_mds)
1064 pg_ops = server->pnfs_curr_ld->pg_write_ops;
1065#endif
1066 nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
1067 server->wsize, ioflags);
1290} 1068}
1291EXPORT_SYMBOL_GPL(nfs_pageio_init_write); 1069EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
1292 1070
1293void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) 1071void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1294{ 1072{
1295 pgio->pg_ops = &nfs_pageio_write_ops; 1073 pgio->pg_ops = &nfs_pgio_rw_ops;
1296 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; 1074 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1297} 1075}
1298EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); 1076EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1299 1077
1300 1078
1301void nfs_write_prepare(struct rpc_task *task, void *calldata)
1302{
1303 struct nfs_write_data *data = calldata;
1304 int err;
1305 err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
1306 if (err)
1307 rpc_exit(task, err);
1308}
1309
1310void nfs_commit_prepare(struct rpc_task *task, void *calldata) 1079void nfs_commit_prepare(struct rpc_task *task, void *calldata)
1311{ 1080{
1312 struct nfs_commit_data *data = calldata; 1081 struct nfs_commit_data *data = calldata;
@@ -1314,23 +1083,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
1314 NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); 1083 NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
1315} 1084}
1316 1085
1317/* 1086static void nfs_writeback_release_common(struct nfs_pgio_data *data)
1318 * Handle a write reply that flushes a whole page.
1319 *
1320 * FIXME: There is an inherent race with invalidate_inode_pages and
1321 * writebacks since the page->count is kept > 1 for as long
1322 * as the page has a write request pending.
1323 */
1324static void nfs_writeback_done_common(struct rpc_task *task, void *calldata)
1325{
1326 struct nfs_write_data *data = calldata;
1327
1328 nfs_writeback_done(task, data);
1329}
1330
1331static void nfs_writeback_release_common(void *calldata)
1332{ 1087{
1333 struct nfs_write_data *data = calldata;
1334 struct nfs_pgio_header *hdr = data->header; 1088 struct nfs_pgio_header *hdr = data->header;
1335 int status = data->task.tk_status; 1089 int status = data->task.tk_status;
1336 1090
@@ -1339,34 +1093,46 @@ static void nfs_writeback_release_common(void *calldata)
1339 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) 1093 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
1340 ; /* Do nothing */ 1094 ; /* Do nothing */
1341 else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) 1095 else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
1342 memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); 1096 memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
1343 else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) 1097 else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
1344 set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); 1098 set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
1345 spin_unlock(&hdr->lock); 1099 spin_unlock(&hdr->lock);
1346 } 1100 }
1347 nfs_writedata_release(data);
1348} 1101}
1349 1102
1350static const struct rpc_call_ops nfs_write_common_ops = { 1103/*
1351 .rpc_call_prepare = nfs_write_prepare, 1104 * Special version of should_remove_suid() that ignores capabilities.
1352 .rpc_call_done = nfs_writeback_done_common, 1105 */
1353 .rpc_release = nfs_writeback_release_common, 1106static int nfs_should_remove_suid(const struct inode *inode)
1354}; 1107{
1108 umode_t mode = inode->i_mode;
1109 int kill = 0;
1110
1111 /* suid always must be killed */
1112 if (unlikely(mode & S_ISUID))
1113 kill = ATTR_KILL_SUID;
1355 1114
1115 /*
1116 * sgid without any exec bits is just a mandatory locking mark; leave
1117 * it alone. If some exec bits are set, it's a real sgid; kill it.
1118 */
1119 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1120 kill |= ATTR_KILL_SGID;
1121
1122 if (unlikely(kill && S_ISREG(mode)))
1123 return kill;
1124
1125 return 0;
1126}
1356 1127
1357/* 1128/*
1358 * This function is called when the WRITE call is complete. 1129 * This function is called when the WRITE call is complete.
1359 */ 1130 */
1360void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1131static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1132 struct inode *inode)
1361{ 1133{
1362 struct nfs_writeargs *argp = &data->args;
1363 struct nfs_writeres *resp = &data->res;
1364 struct inode *inode = data->header->inode;
1365 int status; 1134 int status;
1366 1135
1367 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
1368 task->tk_pid, task->tk_status);
1369
1370 /* 1136 /*
1371 * ->write_done will attempt to use post-op attributes to detect 1137 * ->write_done will attempt to use post-op attributes to detect
1372 * conflicting writes by other clients. A strict interpretation 1138 * conflicting writes by other clients. A strict interpretation
@@ -1376,11 +1142,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1376 */ 1142 */
1377 status = NFS_PROTO(inode)->write_done(task, data); 1143 status = NFS_PROTO(inode)->write_done(task, data);
1378 if (status != 0) 1144 if (status != 0)
1379 return; 1145 return status;
1380 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1146 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
1381 1147
1382#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 1148#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1383 if (resp->verf->committed < argp->stable && task->tk_status >= 0) { 1149 if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
1384 /* We tried a write call, but the server did not 1150 /* We tried a write call, but the server did not
1385 * commit data to stable storage even though we 1151 * commit data to stable storage even though we
1386 * requested it. 1152 * requested it.
@@ -1396,18 +1162,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1396 dprintk("NFS: faulty NFS server %s:" 1162 dprintk("NFS: faulty NFS server %s:"
1397 " (committed = %d) != (stable = %d)\n", 1163 " (committed = %d) != (stable = %d)\n",
1398 NFS_SERVER(inode)->nfs_client->cl_hostname, 1164 NFS_SERVER(inode)->nfs_client->cl_hostname,
1399 resp->verf->committed, argp->stable); 1165 data->res.verf->committed, data->args.stable);
1400 complain = jiffies + 300 * HZ; 1166 complain = jiffies + 300 * HZ;
1401 } 1167 }
1402 } 1168 }
1403#endif 1169#endif
1404 if (task->tk_status < 0) 1170
1405 nfs_set_pgio_error(data->header, task->tk_status, argp->offset); 1171 /* Deal with the suid/sgid bit corner case */
1406 else if (resp->count < argp->count) { 1172 if (nfs_should_remove_suid(inode))
1173 nfs_mark_for_revalidate(inode);
1174 return 0;
1175}
1176
1177/*
1178 * This function is called when the WRITE call is complete.
1179 */
1180static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
1181{
1182 struct nfs_pgio_args *argp = &data->args;
1183 struct nfs_pgio_res *resp = &data->res;
1184
1185 if (resp->count < argp->count) {
1407 static unsigned long complain; 1186 static unsigned long complain;
1408 1187
1409 /* This a short write! */ 1188 /* This a short write! */
1410 nfs_inc_stats(inode, NFSIOS_SHORTWRITE); 1189 nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
1411 1190
1412 /* Has the server at least made some progress? */ 1191 /* Has the server at least made some progress? */
1413 if (resp->count == 0) { 1192 if (resp->count == 0) {
@@ -1874,7 +1653,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1874int __init nfs_init_writepagecache(void) 1653int __init nfs_init_writepagecache(void)
1875{ 1654{
1876 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1655 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1877 sizeof(struct nfs_write_header), 1656 sizeof(struct nfs_rw_header),
1878 0, SLAB_HWCACHE_ALIGN, 1657 0, SLAB_HWCACHE_ALIGN,
1879 NULL); 1658 NULL);
1880 if (nfs_wdata_cachep == NULL) 1659 if (nfs_wdata_cachep == NULL)
@@ -1936,3 +1715,12 @@ void nfs_destroy_writepagecache(void)
1936 kmem_cache_destroy(nfs_wdata_cachep); 1715 kmem_cache_destroy(nfs_wdata_cachep);
1937} 1716}
1938 1717
1718static const struct nfs_rw_ops nfs_rw_write_ops = {
1719 .rw_mode = FMODE_WRITE,
1720 .rw_alloc_header = nfs_writehdr_alloc,
1721 .rw_free_header = nfs_writehdr_free,
1722 .rw_release = nfs_writeback_release_common,
1723 .rw_done = nfs_writeback_done,
1724 .rw_result = nfs_writeback_result,
1725 .rw_initiate = nfs_initiate_write,
1726};