about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
author: Trond Myklebust <Trond.Myklebust@netapp.com> 2007-04-02 19:29:52 -0400
committer: Trond Myklebust <Trond.Myklebust@netapp.com> 2007-05-01 01:17:06 -0400
commit: c63c7b051395368573779c8309aa5c990dcf2f96 (patch)
tree: db54090eef99349d15b95fcd8c2620a2403d8db8 /fs
parent: 8b09bee3083897e375bd0bf9d60f48daedfab3e0 (diff)
NFS: Fix a race when doing NFS write coalescing
Currently we do write coalescing in a very inefficient manner: one pass in generic_writepages() in order to lock the pages for writing, then one pass in nfs_flush_mapping() and/or nfs_sync_mapping_wait() in order to gather the locked pages for coalescing into RPC requests of size "wsize".

In fact, it turns out there is actually a deadlock possible here since we only start I/O on the second pass. If the user signals the process while we're in nfs_sync_mapping_wait(), for instance, then we may exit before starting I/O on all the requests that have been queued up.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/nfs/pagelist.c | 92
-rw-r--r-- fs/nfs/write.c    | 149
2 files changed, 47 insertions(+), 194 deletions(-)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 094537ddd344..ea1a85df9ab1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -17,7 +17,6 @@
17#include <linux/nfs_page.h> 17#include <linux/nfs_page.h>
18#include <linux/nfs_fs.h> 18#include <linux/nfs_fs.h>
19#include <linux/nfs_mount.h> 19#include <linux/nfs_mount.h>
20#include <linux/writeback.h>
21 20
22#define NFS_PARANOIA 1 21#define NFS_PARANOIA 1
23 22
@@ -354,25 +353,6 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
354} 353}
355 354
356/** 355/**
357 * nfs_pageio_add_list - Split coalesced requests out from a list.
358 * @desc: destination io descriptor
359 * @head: source list
360 *
361 * Moves a maximum of 'nmax' elements from one list to another.
362 * The elements are checked to ensure that they form a contiguous set
363 * of pages, and that the RPC credentials are the same.
364 */
365void nfs_pageio_add_list(struct nfs_pageio_descriptor *desc,
366 struct list_head *head)
367{
368 while (!list_empty(head)) {
369 struct nfs_page *req = nfs_list_entry(head->next);
370 if (!nfs_pageio_add_request(desc, req))
371 break;
372 }
373}
374
375/**
376 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 356 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
377 * @desc: pointer to io descriptor 357 * @desc: pointer to io descriptor
378 */ 358 */
@@ -383,78 +363,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
383 363
384#define NFS_SCAN_MAXENTRIES 16 364#define NFS_SCAN_MAXENTRIES 16
385/** 365/**
386 * nfs_scan_dirty - Scan the radix tree for dirty requests
387 * @mapping: pointer to address space
388 * @wbc: writeback_control structure
389 * @dst: Destination list
390 *
391 * Moves elements from one of the inode request lists.
392 * If the number of requests is set to 0, the entire address_space
393 * starting at index idx_start, is scanned.
394 * The requests are *not* checked to ensure that they form a contiguous set.
395 * You must be holding the inode's req_lock when calling this function
396 */
397long nfs_scan_dirty(struct address_space *mapping,
398 struct writeback_control *wbc,
399 struct list_head *dst)
400{
401 struct nfs_inode *nfsi = NFS_I(mapping->host);
402 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
403 struct nfs_page *req;
404 pgoff_t idx_start, idx_end;
405 long res = 0;
406 int found, i;
407
408 if (nfsi->ndirty == 0)
409 return 0;
410 if (wbc->range_cyclic) {
411 idx_start = 0;
412 idx_end = ULONG_MAX;
413 } else if (wbc->range_end == 0) {
414 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
415 idx_end = ULONG_MAX;
416 } else {
417 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
418 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
419 }
420
421 for (;;) {
422 unsigned int toscan = NFS_SCAN_MAXENTRIES;
423
424 found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
425 (void **)&pgvec[0], idx_start, toscan,
426 NFS_PAGE_TAG_DIRTY);
427
428 /* Did we make progress? */
429 if (found <= 0)
430 break;
431
432 for (i = 0; i < found; i++) {
433 req = pgvec[i];
434 if (!wbc->range_cyclic && req->wb_index > idx_end)
435 goto out;
436
437 /* Try to lock request and mark it for writeback */
438 if (!nfs_set_page_writeback_locked(req))
439 goto next;
440 radix_tree_tag_clear(&nfsi->nfs_page_tree,
441 req->wb_index, NFS_PAGE_TAG_DIRTY);
442 nfsi->ndirty--;
443 nfs_list_remove_request(req);
444 nfs_list_add_request(req, dst);
445 res++;
446 if (res == LONG_MAX)
447 goto out;
448next:
449 idx_start = req->wb_index + 1;
450 }
451 }
452out:
453 WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
454 return res;
455}
456
457/**
458 * nfs_scan_list - Scan a list for matching requests 366 * nfs_scan_list - Scan a list for matching requests
459 * @nfsi: NFS inode 367 * @nfsi: NFS inode
460 * @head: One of the NFS inode request lists 368 * @head: One of the NFS inode request lists
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b6749967eead..6ce2d94e7b3f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -38,7 +38,8 @@
38static struct nfs_page * nfs_update_request(struct nfs_open_context*, 38static struct nfs_page * nfs_update_request(struct nfs_open_context*,
39 struct page *, 39 struct page *,
40 unsigned int, unsigned int); 40 unsigned int, unsigned int);
41static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); 41static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
42 struct inode *inode, int ioflags);
42static const struct rpc_call_ops nfs_write_partial_ops; 43static const struct rpc_call_ops nfs_write_partial_ops;
43static const struct rpc_call_ops nfs_write_full_ops; 44static const struct rpc_call_ops nfs_write_full_ops;
44static const struct rpc_call_ops nfs_commit_ops; 45static const struct rpc_call_ops nfs_commit_ops;
@@ -201,7 +202,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
201static int wb_priority(struct writeback_control *wbc) 202static int wb_priority(struct writeback_control *wbc)
202{ 203{
203 if (wbc->for_reclaim) 204 if (wbc->for_reclaim)
204 return FLUSH_HIGHPRI; 205 return FLUSH_HIGHPRI | FLUSH_STABLE;
205 if (wbc->for_kupdate) 206 if (wbc->for_kupdate)
206 return FLUSH_LOWPRI; 207 return FLUSH_LOWPRI;
207 return 0; 208 return 0;
@@ -251,7 +252,8 @@ static void nfs_end_page_writeback(struct page *page)
251 * was not tagged. 252 * was not tagged.
252 * May also return an error if the user signalled nfs_wait_on_request(). 253 * May also return an error if the user signalled nfs_wait_on_request().
253 */ 254 */
254static int nfs_page_mark_flush(struct page *page) 255static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
256 struct page *page)
255{ 257{
256 struct nfs_page *req; 258 struct nfs_page *req;
257 struct nfs_inode *nfsi = NFS_I(page->mapping->host); 259 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
@@ -273,6 +275,8 @@ static int nfs_page_mark_flush(struct page *page)
273 * request as dirty (in which case we don't care). 275 * request as dirty (in which case we don't care).
274 */ 276 */
275 spin_unlock(req_lock); 277 spin_unlock(req_lock);
278 /* Prevent deadlock! */
279 nfs_pageio_complete(pgio);
276 ret = nfs_wait_on_request(req); 280 ret = nfs_wait_on_request(req);
277 nfs_release_request(req); 281 nfs_release_request(req);
278 if (ret != 0) 282 if (ret != 0)
@@ -283,21 +287,18 @@ static int nfs_page_mark_flush(struct page *page)
283 /* This request is marked for commit */ 287 /* This request is marked for commit */
284 spin_unlock(req_lock); 288 spin_unlock(req_lock);
285 nfs_unlock_request(req); 289 nfs_unlock_request(req);
290 nfs_pageio_complete(pgio);
286 return 1; 291 return 1;
287 } 292 }
288 if (nfs_set_page_writeback(page) == 0) { 293 if (nfs_set_page_writeback(page) != 0) {
289 nfs_list_remove_request(req);
290 /* add the request to the inode's dirty list. */
291 radix_tree_tag_set(&nfsi->nfs_page_tree,
292 req->wb_index, NFS_PAGE_TAG_DIRTY);
293 nfs_list_add_request(req, &nfsi->dirty);
294 nfsi->ndirty++;
295 spin_unlock(req_lock);
296 __mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES);
297 } else
298 spin_unlock(req_lock); 294 spin_unlock(req_lock);
295 BUG();
296 }
297 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
298 NFS_PAGE_TAG_WRITEBACK);
299 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); 299 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
300 nfs_unlock_request(req); 300 spin_unlock(req_lock);
301 nfs_pageio_add_request(pgio, req);
301 return ret; 302 return ret;
302} 303}
303 304
@@ -306,6 +307,7 @@ static int nfs_page_mark_flush(struct page *page)
306 */ 307 */
307static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) 308static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
308{ 309{
310 struct nfs_pageio_descriptor mypgio, *pgio;
309 struct nfs_open_context *ctx; 311 struct nfs_open_context *ctx;
310 struct inode *inode = page->mapping->host; 312 struct inode *inode = page->mapping->host;
311 unsigned offset; 313 unsigned offset;
@@ -314,7 +316,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
314 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 316 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
315 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 317 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
316 318
317 err = nfs_page_mark_flush(page); 319 if (wbc->for_writepages)
320 pgio = wbc->fs_private;
321 else {
322 nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
323 pgio = &mypgio;
324 }
325
326 err = nfs_page_async_flush(pgio, page);
318 if (err <= 0) 327 if (err <= 0)
319 goto out; 328 goto out;
320 err = 0; 329 err = 0;
@@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
331 put_nfs_open_context(ctx); 340 put_nfs_open_context(ctx);
332 if (err != 0) 341 if (err != 0)
333 goto out; 342 goto out;
334 err = nfs_page_mark_flush(page); 343 err = nfs_page_async_flush(pgio, page);
335 if (err > 0) 344 if (err > 0)
336 err = 0; 345 err = 0;
337out: 346out:
338 if (!wbc->for_writepages) 347 if (!wbc->for_writepages)
339 nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); 348 nfs_pageio_complete(pgio);
340 return err; 349 return err;
341} 350}
342 351
@@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
352int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 361int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
353{ 362{
354 struct inode *inode = mapping->host; 363 struct inode *inode = mapping->host;
364 struct nfs_pageio_descriptor pgio;
355 int err; 365 int err;
356 366
357 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 367 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
358 368
369 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
370 wbc->fs_private = &pgio;
359 err = generic_writepages(mapping, wbc); 371 err = generic_writepages(mapping, wbc);
372 nfs_pageio_complete(&pgio);
360 if (err) 373 if (err)
361 return err; 374 return err;
362 err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); 375 if (pgio.pg_error)
363 if (err < 0) 376 return pgio.pg_error;
364 goto out; 377 return 0;
365 nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
366 err = 0;
367out:
368 return err;
369} 378}
370 379
371/* 380/*
@@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
536 return res; 545 return res;
537} 546}
538 547
539static void nfs_cancel_dirty_list(struct list_head *head)
540{
541 struct nfs_page *req;
542 while(!list_empty(head)) {
543 req = nfs_list_entry(head->next);
544 nfs_list_remove_request(req);
545 nfs_end_page_writeback(req->wb_page);
546 nfs_inode_remove_request(req);
547 nfs_clear_page_writeback(req);
548 }
549}
550
551static void nfs_cancel_commit_list(struct list_head *head) 548static void nfs_cancel_commit_list(struct list_head *head)
552{ 549{
553 struct nfs_page *req; 550 struct nfs_page *req;
@@ -936,33 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, size_t cou
936 return -ENOMEM; 933 return -ENOMEM;
937} 934}
938 935
939static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how) 936static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
937 struct inode *inode, int ioflags)
940{ 938{
941 struct nfs_pageio_descriptor desc;
942 int wpages = NFS_SERVER(inode)->wpages;
943 int wsize = NFS_SERVER(inode)->wsize; 939 int wsize = NFS_SERVER(inode)->wsize;
944 940
945 /* For single writes, FLUSH_STABLE is more efficient */
946 if (npages <= wpages && npages == NFS_I(inode)->npages
947 && nfs_list_entry(head->next)->wb_bytes <= wsize)
948 how |= FLUSH_STABLE;
949
950 if (wsize < PAGE_CACHE_SIZE) 941 if (wsize < PAGE_CACHE_SIZE)
951 nfs_pageio_init(&desc, inode, nfs_flush_multi, wsize, how); 942 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
952 else 943 else
953 nfs_pageio_init(&desc, inode, nfs_flush_one, wsize, how); 944 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
954 nfs_pageio_add_list(&desc, head);
955 nfs_pageio_complete(&desc);
956 if (desc.pg_error == 0)
957 return 0;
958 while (!list_empty(head)) {
959 struct nfs_page *req = nfs_list_entry(head->next);
960 nfs_list_remove_request(req);
961 nfs_redirty_request(req);
962 nfs_end_page_writeback(req->wb_page);
963 nfs_clear_page_writeback(req);
964 }
965 return desc.pg_error;
966} 945}
967 946
968/* 947/*
@@ -1286,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1286 .rpc_call_done = nfs_commit_done, 1265 .rpc_call_done = nfs_commit_done,
1287 .rpc_release = nfs_commit_release, 1266 .rpc_release = nfs_commit_release,
1288}; 1267};
1289#else
1290static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1291{
1292 return 0;
1293}
1294#endif
1295
1296static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
1297{
1298 struct nfs_inode *nfsi = NFS_I(mapping->host);
1299 LIST_HEAD(head);
1300 long res;
1301
1302 spin_lock(&nfsi->req_lock);
1303 res = nfs_scan_dirty(mapping, wbc, &head);
1304 spin_unlock(&nfsi->req_lock);
1305 if (res) {
1306 int error = nfs_flush_list(mapping->host, &head, res, how);
1307 if (error < 0)
1308 return error;
1309 }
1310 return res;
1311}
1312 1268
1313#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1314int nfs_commit_inode(struct inode *inode, int how) 1269int nfs_commit_inode(struct inode *inode, int how)
1315{ 1270{
1316 struct nfs_inode *nfsi = NFS_I(inode); 1271 struct nfs_inode *nfsi = NFS_I(inode);
@@ -1327,6 +1282,11 @@ int nfs_commit_inode(struct inode *inode, int how)
1327 } 1282 }
1328 return res; 1283 return res;
1329} 1284}
1285#else
1286static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1287{
1288 return 0;
1289}
1330#endif 1290#endif
1331 1291
1332long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1292long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
@@ -1360,19 +1320,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
1360 ret = nfs_wait_on_requests_locked(inode, idx_start, npages); 1320 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1361 if (ret != 0) 1321 if (ret != 0)
1362 continue; 1322 continue;
1363 pages = nfs_scan_dirty(mapping, wbc, &head);
1364 if (pages != 0) {
1365 spin_unlock(&nfsi->req_lock);
1366 if (how & FLUSH_INVALIDATE) {
1367 nfs_cancel_dirty_list(&head);
1368 ret = pages;
1369 } else
1370 ret = nfs_flush_list(inode, &head, pages, how);
1371 spin_lock(&nfsi->req_lock);
1372 continue;
1373 }
1374 if (wbc->pages_skipped != 0)
1375 continue;
1376 if (nocommit) 1323 if (nocommit)
1377 break; 1324 break;
1378 pages = nfs_scan_commit(inode, &head, idx_start, npages); 1325 pages = nfs_scan_commit(inode, &head, idx_start, npages);
@@ -1412,7 +1359,7 @@ int nfs_wb_all(struct inode *inode)
1412 }; 1359 };
1413 int ret; 1360 int ret;
1414 1361
1415 ret = generic_writepages(mapping, &wbc); 1362 ret = nfs_writepages(mapping, &wbc);
1416 if (ret < 0) 1363 if (ret < 0)
1417 goto out; 1364 goto out;
1418 ret = nfs_sync_mapping_wait(mapping, &wbc, 0); 1365 ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
@@ -1435,11 +1382,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
1435 }; 1382 };
1436 int ret; 1383 int ret;
1437 1384
1438 if (!(how & FLUSH_NOWRITEPAGE)) { 1385 ret = nfs_writepages(mapping, &wbc);
1439 ret = generic_writepages(mapping, &wbc); 1386 if (ret < 0)
1440 if (ret < 0) 1387 goto out;
1441 goto out;
1442 }
1443 ret = nfs_sync_mapping_wait(mapping, &wbc, how); 1388 ret = nfs_sync_mapping_wait(mapping, &wbc, how);
1444 if (ret >= 0) 1389 if (ret >= 0)
1445 return 0; 1390 return 0;
@@ -1462,7 +1407,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
1462 int ret; 1407 int ret;
1463 1408
1464 BUG_ON(!PageLocked(page)); 1409 BUG_ON(!PageLocked(page));
1465 if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) { 1410 if (clear_page_dirty_for_io(page)) {
1466 ret = nfs_writepage_locked(page, &wbc); 1411 ret = nfs_writepage_locked(page, &wbc);
1467 if (ret < 0) 1412 if (ret < 0)
1468 goto out; 1413 goto out;