author		Weston Andros Adamson <dros@primarydata.com>	2014-05-15 11:56:45 -0400
committer	Trond Myklebust <trond.myklebust@primarydata.com>	2014-05-29 11:11:44 -0400
commit		2bfc6e566daa8386c9cffef2f7de17fc330d3835
tree		a615bb7091787ad574c5b31bcd6a30a5bfb8c2f9 /fs
parent		ab75e417192a486ffe63a314b6d2e7361f0e157f
nfs: add support for multiple nfs reqs per page
Add "page groups" - a circular list of nfs requests (struct nfs_page) that all reference the same page. This gives nfs read and write paths the ability to account for sub-page regions independently. This somewhat follows the design of struct buffer_head's sub-page accounting. Only "head" requests are ever added/removed from the inode list in the buffered write path. "head" and "sub" requests are treated the same through the read path and the rest of the write/commit path. Requests are given an extra reference across the life of the list. Page groups are never rejoined after being split. If the read/write request fails and the client falls back to another path (ie revert to MDS in PNFS case), the already split requests are pushed through the recoalescing code again, which may split them further and then coalesce them into properly sized requests on the wire. Fragmentation shouldn't be a problem with the current design, because we flush all requests in page group when a non-contiguous request is added, so the only time resplitting should occur is on a resend of a read or write. This patch lays the groundwork for sub-page splitting, but does not actually do any splitting. For now all page groups have one request as pg_test functions don't yet split pages. There are several related patches that are needed support multiple requests per page group. Signed-off-by: Weston Andros Adamson <dros@primarydata.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Diffstat (limited to 'fs')
-rw-r--r--	fs/nfs/direct.c		  7
-rw-r--r--	fs/nfs/pagelist.c	220
-rw-r--r--	fs/nfs/read.c		  4
-rw-r--r--	fs/nfs/write.c		 13
4 files changed, 225 insertions, 19 deletions
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1dd8c622d719..2c0e08f4cf71 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 		struct nfs_page *req;
 		unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 		/* XXX do we need to do the eof zeroing found in async_filler? */
-		req = nfs_create_request(dreq->ctx, pagevec[i],
+		req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
 					 pgbase, req_len);
 		if (IS_ERR(req)) {
 			result = PTR_ERR(req);
@@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 		struct nfs_page *req;
 		unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 
-		req = nfs_create_request(dreq->ctx, pagevec[i],
+		req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
 					 pgbase, req_len);
 		if (IS_ERR(req)) {
 			result = PTR_ERR(req);
@@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 	spin_unlock(&dreq->lock);
 
 	while (!list_empty(&hdr->pages)) {
+		bool do_destroy = true;
+
 		req = nfs_list_entry(hdr->pages.next);
 		nfs_list_remove_request(req);
 		switch (bit) {
@@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 		case NFS_IOHDR_NEED_COMMIT:
 			kref_get(&req->wb_kref);
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+			do_destroy = false;
 		}
 		nfs_unlock_and_release_request(req);
 	}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index f343f49ff596..015fb7b48dfe 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -29,6 +29,8 @@
 static struct kmem_cache *nfs_page_cachep;
 static const struct rpc_call_ops nfs_pgio_common_ops;
 
+static void nfs_free_request(struct nfs_page *);
+
 static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
 {
 	p->npages = pagecount;
@@ -136,10 +138,151 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
 	return __nfs_iocounter_wait(c);
 }
 
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req - request in group that is to be locked
+ *
+ * this lock must be held if modifying the page group list
+ */
+void
+nfs_page_group_lock(struct nfs_page *req)
+{
+	struct nfs_page *head = req->wb_head;
+	int err = -EAGAIN;
+
+	WARN_ON_ONCE(head != head->wb_head);
+
+	while (err)
+		err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req - request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+	struct nfs_page *head = req->wb_head;
+
+	WARN_ON_ONCE(head != head->wb_head);
+
+	smp_mb__before_clear_bit();
+	clear_bit(PG_HEADLOCK, &head->wb_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+	struct nfs_page *head = req->wb_head;
+	struct nfs_page *tmp;
+
+	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+	WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+	tmp = req->wb_this_page;
+	while (tmp != req) {
+		if (!test_bit(bit, &tmp->wb_flags))
+			return false;
+		tmp = tmp->wb_this_page;
+	}
+
+	/* true! reset all bits */
+	tmp = req;
+	do {
+		clear_bit(bit, &tmp->wb_flags);
+		tmp = tmp->wb_this_page;
+	} while (tmp != req);
+
+	return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ * return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+	bool ret;
+
+	nfs_page_group_lock(req);
+	ret = nfs_page_group_sync_on_bit_locked(req, bit);
+	nfs_page_group_unlock(req);
+
+	return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ *         or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+	WARN_ON_ONCE(prev == req);
+
+	if (!prev) {
+		req->wb_head = req;
+		req->wb_this_page = req;
+	} else {
+		WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+		WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+		req->wb_head = prev->wb_head;
+		req->wb_this_page = prev->wb_this_page;
+		prev->wb_this_page = req;
+
+		/* grab extra ref if head request has extra ref from
+		 * the write/commit path to handle handoff between write
+		 * and commit lists */
+		if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags))
+			kref_get(&req->wb_kref);
+	}
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @req - request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+	struct nfs_page *tmp, *next;
+
+	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+		return;
+
+	tmp = req;
+	do {
+		next = tmp->wb_this_page;
+		/* unlink and free */
+		tmp->wb_this_page = tmp;
+		tmp->wb_head = tmp;
+		nfs_free_request(tmp);
+		tmp = next;
+	} while (tmp != req);
+}
+
 /**
  * nfs_create_request - Create an NFS read/write request.
  * @ctx: open context to use
  * @page: page to write
+ * @last: last nfs request created for this page group or NULL if head
  * @offset: starting offset within the page for the write
  * @count: number of bytes to read/write
  *
@@ -149,7 +292,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
  */
 struct nfs_page *
 nfs_create_request(struct nfs_open_context *ctx, struct page *page,
-		   unsigned int offset, unsigned int count)
+		   struct nfs_page *last, unsigned int offset,
+		   unsigned int count)
 {
 	struct nfs_page	*req;
 	struct nfs_lock_context *l_ctx;
@@ -181,6 +325,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
 	req->wb_bytes   = count;
 	req->wb_context = get_nfs_open_context(ctx);
 	kref_init(&req->wb_kref);
+	nfs_page_group_init(req, last);
 	return req;
 }
 
@@ -238,16 +383,18 @@ static void nfs_clear_request(struct nfs_page *req)
 	}
 }
 
-
 /**
  * nfs_release_request - Release the count on an NFS read/write request
  * @req: request to release
  *
  * Note: Should never be called with the spinlock held!
  */
-static void nfs_free_request(struct kref *kref)
+static void nfs_free_request(struct nfs_page *req)
 {
-	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+	WARN_ON_ONCE(req->wb_this_page != req);
+
+	/* extra debug: make sure no sync bits are still set */
+	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
 
 	/* Release struct file and open context */
 	nfs_clear_request(req);
@@ -256,7 +403,7 @@ static void nfs_free_request(struct kref *kref)
 
 void nfs_release_request(struct nfs_page *req)
 {
-	kref_put(&req->wb_kref, nfs_free_request);
+	kref_put(&req->wb_kref, nfs_page_group_destroy);
 }
 
 static int nfs_wait_bit_uninterruptible(void *word)
@@ -832,21 +979,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
  * @desc: destination io descriptor
  * @req: request
  *
+ * This may split a request into subrequests which are all part of the
+ * same page group.
+ *
  * Returns true if the request 'req' was successfully coalesced into the
  * existing list of pages 'desc'.
  */
 static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 			   struct nfs_page *req)
 {
-	while (!nfs_pageio_do_add_request(desc, req)) {
-		desc->pg_moreio = 1;
-		nfs_pageio_doio(desc);
-		if (desc->pg_error < 0)
-			return 0;
-		desc->pg_moreio = 0;
-		if (desc->pg_recoalesce)
-			return 0;
-	}
+	struct nfs_page *subreq;
+	unsigned int bytes_left = 0;
+	unsigned int offset, pgbase;
+
+	nfs_page_group_lock(req);
+
+	subreq = req;
+	bytes_left = subreq->wb_bytes;
+	offset = subreq->wb_offset;
+	pgbase = subreq->wb_pgbase;
+
+	do {
+		if (!nfs_pageio_do_add_request(desc, subreq)) {
+			/* make sure pg_test call(s) did nothing */
+			WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
+			WARN_ON_ONCE(subreq->wb_offset != offset);
+			WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
+
+			nfs_page_group_unlock(req);
+			desc->pg_moreio = 1;
+			nfs_pageio_doio(desc);
+			if (desc->pg_error < 0)
+				return 0;
+			desc->pg_moreio = 0;
+			if (desc->pg_recoalesce)
+				return 0;
+			/* retry add_request for this subreq */
+			nfs_page_group_lock(req);
+			continue;
+		}
+
+		/* check for buggy pg_test call(s) */
+		WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
+		WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
+		WARN_ON_ONCE(subreq->wb_bytes == 0);
+
+		bytes_left -= subreq->wb_bytes;
+		offset += subreq->wb_bytes;
+		pgbase += subreq->wb_bytes;
+
+		if (bytes_left) {
+			subreq = nfs_create_request(req->wb_context,
+					req->wb_page,
+					subreq, pgbase, bytes_left);
+			nfs_lock_request(subreq);
+			subreq->wb_offset  = offset;
+			subreq->wb_index = req->wb_index;
+		}
+	} while (bytes_left > 0);
+
+	nfs_page_group_unlock(req);
 	return 1;
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 46d90448f69b..902ba2c63d05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
-	new = nfs_create_request(ctx, page, 0, len);
+	new = nfs_create_request(ctx, page, NULL, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
 		return PTR_ERR(new);
@@ -311,7 +311,7 @@ readpage_async_filler(void *data, struct page *page)
 	if (len == 0)
 		return nfs_return_empty_page(page);
 
-	new = nfs_create_request(desc->ctx, page, 0, len);
+	new = nfs_create_request(desc->ctx, page, NULL, 0, len);
 	if (IS_ERR(new))
 		goto out_error;
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e773df207c05..d0f30f12a8b3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -367,6 +367,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
+	WARN_ON_ONCE(req->wb_this_page != req);
+
 	/* Lock the request! */
 	nfs_lock_request(req);
 
@@ -383,6 +385,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 		set_page_private(req->wb_page, (unsigned long)req);
 	}
 	nfsi->npages++;
+	set_bit(PG_INODE_REF, &req->wb_flags);
 	kref_get(&req->wb_kref);
 	spin_unlock(&inode->i_lock);
 }
@@ -567,6 +570,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_commit_info cinfo;
 	unsigned long bytes = 0;
+	bool do_destroy;
 
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
 		goto out;
@@ -596,6 +600,7 @@ remove_req:
 next:
 		nfs_unlock_request(req);
 		nfs_end_page_writeback(req->wb_page);
+		do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
 		nfs_release_request(req);
 	}
 out:
@@ -700,6 +705,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 	if (req == NULL)
 		goto out_unlock;
 
+	/* should be handled by nfs_flush_incompatible */
+	WARN_ON_ONCE(req->wb_head != req);
+	WARN_ON_ONCE(req->wb_this_page != req);
+
 	rqend = req->wb_offset + req->wb_bytes;
 	/*
 	 * Tell the caller to flush out the request if
@@ -761,7 +770,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
 	req = nfs_try_to_update_request(inode, page, offset, bytes);
 	if (req != NULL)
 		goto out;
-	req = nfs_create_request(ctx, page, offset, bytes);
+	req = nfs_create_request(ctx, page, NULL, offset, bytes);
 	if (IS_ERR(req))
 		goto out;
 	nfs_inode_add_request(inode, req);
@@ -805,6 +814,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		return 0;
 	l_ctx = req->wb_lock_context;
 	do_flush = req->wb_page != page || req->wb_context != ctx;
+	/* for now, flush if more than 1 request in page_group */
+	do_flush |= req->wb_this_page != req;
 	if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
 		do_flush |= l_ctx->lockowner.l_owner != current->files
 			|| l_ctx->lockowner.l_pid != current->tgid;