diff options
author | Trond Myklebust <Trond.Myklebust@netapp.com> | 2007-04-02 19:29:52 -0400 |
---|---|---|
committer | Trond Myklebust <Trond.Myklebust@netapp.com> | 2007-05-01 01:17:06 -0400 |
commit | c63c7b051395368573779c8309aa5c990dcf2f96 (patch) | |
tree | db54090eef99349d15b95fcd8c2620a2403d8db8 /fs/nfs/write.c | |
parent | 8b09bee3083897e375bd0bf9d60f48daedfab3e0 (diff) |
NFS: Fix a race when doing NFS write coalescing
Currently we do write coalescing in a very inefficient manner: one pass in
generic_writepages() in order to lock the pages for writing, then one pass
in nfs_flush_mapping() and/or nfs_sync_mapping_wait() in order to gather
the locked pages for coalescing into RPC requests of size "wsize".
In fact, it turns out there is actually a deadlock possible here since we
only start I/O on the second pass. If the user signals the process while
we're in nfs_sync_mapping_wait(), for instance, then we may exit before
starting I/O on all the requests that have been queued up.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'fs/nfs/write.c')
-rw-r--r-- | fs/nfs/write.c | 149 |
1 file changed, 47 insertions, 102 deletions
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b6749967eead..6ce2d94e7b3f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -38,7 +38,8 @@ | |||
38 | static struct nfs_page * nfs_update_request(struct nfs_open_context*, | 38 | static struct nfs_page * nfs_update_request(struct nfs_open_context*, |
39 | struct page *, | 39 | struct page *, |
40 | unsigned int, unsigned int); | 40 | unsigned int, unsigned int); |
41 | static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); | 41 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, |
42 | struct inode *inode, int ioflags); | ||
42 | static const struct rpc_call_ops nfs_write_partial_ops; | 43 | static const struct rpc_call_ops nfs_write_partial_ops; |
43 | static const struct rpc_call_ops nfs_write_full_ops; | 44 | static const struct rpc_call_ops nfs_write_full_ops; |
44 | static const struct rpc_call_ops nfs_commit_ops; | 45 | static const struct rpc_call_ops nfs_commit_ops; |
@@ -201,7 +202,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, | |||
201 | static int wb_priority(struct writeback_control *wbc) | 202 | static int wb_priority(struct writeback_control *wbc) |
202 | { | 203 | { |
203 | if (wbc->for_reclaim) | 204 | if (wbc->for_reclaim) |
204 | return FLUSH_HIGHPRI; | 205 | return FLUSH_HIGHPRI | FLUSH_STABLE; |
205 | if (wbc->for_kupdate) | 206 | if (wbc->for_kupdate) |
206 | return FLUSH_LOWPRI; | 207 | return FLUSH_LOWPRI; |
207 | return 0; | 208 | return 0; |
@@ -251,7 +252,8 @@ static void nfs_end_page_writeback(struct page *page) | |||
251 | * was not tagged. | 252 | * was not tagged. |
252 | * May also return an error if the user signalled nfs_wait_on_request(). | 253 | * May also return an error if the user signalled nfs_wait_on_request(). |
253 | */ | 254 | */ |
254 | static int nfs_page_mark_flush(struct page *page) | 255 | static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, |
256 | struct page *page) | ||
255 | { | 257 | { |
256 | struct nfs_page *req; | 258 | struct nfs_page *req; |
257 | struct nfs_inode *nfsi = NFS_I(page->mapping->host); | 259 | struct nfs_inode *nfsi = NFS_I(page->mapping->host); |
@@ -273,6 +275,8 @@ static int nfs_page_mark_flush(struct page *page) | |||
273 | * request as dirty (in which case we don't care). | 275 | * request as dirty (in which case we don't care). |
274 | */ | 276 | */ |
275 | spin_unlock(req_lock); | 277 | spin_unlock(req_lock); |
278 | /* Prevent deadlock! */ | ||
279 | nfs_pageio_complete(pgio); | ||
276 | ret = nfs_wait_on_request(req); | 280 | ret = nfs_wait_on_request(req); |
277 | nfs_release_request(req); | 281 | nfs_release_request(req); |
278 | if (ret != 0) | 282 | if (ret != 0) |
@@ -283,21 +287,18 @@ static int nfs_page_mark_flush(struct page *page) | |||
283 | /* This request is marked for commit */ | 287 | /* This request is marked for commit */ |
284 | spin_unlock(req_lock); | 288 | spin_unlock(req_lock); |
285 | nfs_unlock_request(req); | 289 | nfs_unlock_request(req); |
290 | nfs_pageio_complete(pgio); | ||
286 | return 1; | 291 | return 1; |
287 | } | 292 | } |
288 | if (nfs_set_page_writeback(page) == 0) { | 293 | if (nfs_set_page_writeback(page) != 0) { |
289 | nfs_list_remove_request(req); | ||
290 | /* add the request to the inode's dirty list. */ | ||
291 | radix_tree_tag_set(&nfsi->nfs_page_tree, | ||
292 | req->wb_index, NFS_PAGE_TAG_DIRTY); | ||
293 | nfs_list_add_request(req, &nfsi->dirty); | ||
294 | nfsi->ndirty++; | ||
295 | spin_unlock(req_lock); | ||
296 | __mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES); | ||
297 | } else | ||
298 | spin_unlock(req_lock); | 294 | spin_unlock(req_lock); |
295 | BUG(); | ||
296 | } | ||
297 | radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, | ||
298 | NFS_PAGE_TAG_WRITEBACK); | ||
299 | ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); | 299 | ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); |
300 | nfs_unlock_request(req); | 300 | spin_unlock(req_lock); |
301 | nfs_pageio_add_request(pgio, req); | ||
301 | return ret; | 302 | return ret; |
302 | } | 303 | } |
303 | 304 | ||
@@ -306,6 +307,7 @@ static int nfs_page_mark_flush(struct page *page) | |||
306 | */ | 307 | */ |
307 | static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) | 308 | static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) |
308 | { | 309 | { |
310 | struct nfs_pageio_descriptor mypgio, *pgio; | ||
309 | struct nfs_open_context *ctx; | 311 | struct nfs_open_context *ctx; |
310 | struct inode *inode = page->mapping->host; | 312 | struct inode *inode = page->mapping->host; |
311 | unsigned offset; | 313 | unsigned offset; |
@@ -314,7 +316,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc | |||
314 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); | 316 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); |
315 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); | 317 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); |
316 | 318 | ||
317 | err = nfs_page_mark_flush(page); | 319 | if (wbc->for_writepages) |
320 | pgio = wbc->fs_private; | ||
321 | else { | ||
322 | nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc)); | ||
323 | pgio = &mypgio; | ||
324 | } | ||
325 | |||
326 | err = nfs_page_async_flush(pgio, page); | ||
318 | if (err <= 0) | 327 | if (err <= 0) |
319 | goto out; | 328 | goto out; |
320 | err = 0; | 329 | err = 0; |
@@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc | |||
331 | put_nfs_open_context(ctx); | 340 | put_nfs_open_context(ctx); |
332 | if (err != 0) | 341 | if (err != 0) |
333 | goto out; | 342 | goto out; |
334 | err = nfs_page_mark_flush(page); | 343 | err = nfs_page_async_flush(pgio, page); |
335 | if (err > 0) | 344 | if (err > 0) |
336 | err = 0; | 345 | err = 0; |
337 | out: | 346 | out: |
338 | if (!wbc->for_writepages) | 347 | if (!wbc->for_writepages) |
339 | nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); | 348 | nfs_pageio_complete(pgio); |
340 | return err; | 349 | return err; |
341 | } | 350 | } |
342 | 351 | ||
@@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) | |||
352 | int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) | 361 | int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) |
353 | { | 362 | { |
354 | struct inode *inode = mapping->host; | 363 | struct inode *inode = mapping->host; |
364 | struct nfs_pageio_descriptor pgio; | ||
355 | int err; | 365 | int err; |
356 | 366 | ||
357 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); | 367 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); |
358 | 368 | ||
369 | nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); | ||
370 | wbc->fs_private = &pgio; | ||
359 | err = generic_writepages(mapping, wbc); | 371 | err = generic_writepages(mapping, wbc); |
372 | nfs_pageio_complete(&pgio); | ||
360 | if (err) | 373 | if (err) |
361 | return err; | 374 | return err; |
362 | err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); | 375 | if (pgio.pg_error) |
363 | if (err < 0) | 376 | return pgio.pg_error; |
364 | goto out; | 377 | return 0; |
365 | nfs_add_stats(inode, NFSIOS_WRITEPAGES, err); | ||
366 | err = 0; | ||
367 | out: | ||
368 | return err; | ||
369 | } | 378 | } |
370 | 379 | ||
371 | /* | 380 | /* |
@@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st | |||
536 | return res; | 545 | return res; |
537 | } | 546 | } |
538 | 547 | ||
539 | static void nfs_cancel_dirty_list(struct list_head *head) | ||
540 | { | ||
541 | struct nfs_page *req; | ||
542 | while(!list_empty(head)) { | ||
543 | req = nfs_list_entry(head->next); | ||
544 | nfs_list_remove_request(req); | ||
545 | nfs_end_page_writeback(req->wb_page); | ||
546 | nfs_inode_remove_request(req); | ||
547 | nfs_clear_page_writeback(req); | ||
548 | } | ||
549 | } | ||
550 | |||
551 | static void nfs_cancel_commit_list(struct list_head *head) | 548 | static void nfs_cancel_commit_list(struct list_head *head) |
552 | { | 549 | { |
553 | struct nfs_page *req; | 550 | struct nfs_page *req; |
@@ -936,33 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, size_t cou | |||
936 | return -ENOMEM; | 933 | return -ENOMEM; |
937 | } | 934 | } |
938 | 935 | ||
939 | static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how) | 936 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, |
937 | struct inode *inode, int ioflags) | ||
940 | { | 938 | { |
941 | struct nfs_pageio_descriptor desc; | ||
942 | int wpages = NFS_SERVER(inode)->wpages; | ||
943 | int wsize = NFS_SERVER(inode)->wsize; | 939 | int wsize = NFS_SERVER(inode)->wsize; |
944 | 940 | ||
945 | /* For single writes, FLUSH_STABLE is more efficient */ | ||
946 | if (npages <= wpages && npages == NFS_I(inode)->npages | ||
947 | && nfs_list_entry(head->next)->wb_bytes <= wsize) | ||
948 | how |= FLUSH_STABLE; | ||
949 | |||
950 | if (wsize < PAGE_CACHE_SIZE) | 941 | if (wsize < PAGE_CACHE_SIZE) |
951 | nfs_pageio_init(&desc, inode, nfs_flush_multi, wsize, how); | 942 | nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); |
952 | else | 943 | else |
953 | nfs_pageio_init(&desc, inode, nfs_flush_one, wsize, how); | 944 | nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); |
954 | nfs_pageio_add_list(&desc, head); | ||
955 | nfs_pageio_complete(&desc); | ||
956 | if (desc.pg_error == 0) | ||
957 | return 0; | ||
958 | while (!list_empty(head)) { | ||
959 | struct nfs_page *req = nfs_list_entry(head->next); | ||
960 | nfs_list_remove_request(req); | ||
961 | nfs_redirty_request(req); | ||
962 | nfs_end_page_writeback(req->wb_page); | ||
963 | nfs_clear_page_writeback(req); | ||
964 | } | ||
965 | return desc.pg_error; | ||
966 | } | 945 | } |
967 | 946 | ||
968 | /* | 947 | /* |
@@ -1286,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = { | |||
1286 | .rpc_call_done = nfs_commit_done, | 1265 | .rpc_call_done = nfs_commit_done, |
1287 | .rpc_release = nfs_commit_release, | 1266 | .rpc_release = nfs_commit_release, |
1288 | }; | 1267 | }; |
1289 | #else | ||
1290 | static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) | ||
1291 | { | ||
1292 | return 0; | ||
1293 | } | ||
1294 | #endif | ||
1295 | |||
1296 | static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) | ||
1297 | { | ||
1298 | struct nfs_inode *nfsi = NFS_I(mapping->host); | ||
1299 | LIST_HEAD(head); | ||
1300 | long res; | ||
1301 | |||
1302 | spin_lock(&nfsi->req_lock); | ||
1303 | res = nfs_scan_dirty(mapping, wbc, &head); | ||
1304 | spin_unlock(&nfsi->req_lock); | ||
1305 | if (res) { | ||
1306 | int error = nfs_flush_list(mapping->host, &head, res, how); | ||
1307 | if (error < 0) | ||
1308 | return error; | ||
1309 | } | ||
1310 | return res; | ||
1311 | } | ||
1312 | 1268 | ||
1313 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | ||
1314 | int nfs_commit_inode(struct inode *inode, int how) | 1269 | int nfs_commit_inode(struct inode *inode, int how) |
1315 | { | 1270 | { |
1316 | struct nfs_inode *nfsi = NFS_I(inode); | 1271 | struct nfs_inode *nfsi = NFS_I(inode); |
@@ -1327,6 +1282,11 @@ int nfs_commit_inode(struct inode *inode, int how) | |||
1327 | } | 1282 | } |
1328 | return res; | 1283 | return res; |
1329 | } | 1284 | } |
1285 | #else | ||
1286 | static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) | ||
1287 | { | ||
1288 | return 0; | ||
1289 | } | ||
1330 | #endif | 1290 | #endif |
1331 | 1291 | ||
1332 | long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) | 1292 | long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) |
@@ -1360,19 +1320,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr | |||
1360 | ret = nfs_wait_on_requests_locked(inode, idx_start, npages); | 1320 | ret = nfs_wait_on_requests_locked(inode, idx_start, npages); |
1361 | if (ret != 0) | 1321 | if (ret != 0) |
1362 | continue; | 1322 | continue; |
1363 | pages = nfs_scan_dirty(mapping, wbc, &head); | ||
1364 | if (pages != 0) { | ||
1365 | spin_unlock(&nfsi->req_lock); | ||
1366 | if (how & FLUSH_INVALIDATE) { | ||
1367 | nfs_cancel_dirty_list(&head); | ||
1368 | ret = pages; | ||
1369 | } else | ||
1370 | ret = nfs_flush_list(inode, &head, pages, how); | ||
1371 | spin_lock(&nfsi->req_lock); | ||
1372 | continue; | ||
1373 | } | ||
1374 | if (wbc->pages_skipped != 0) | ||
1375 | continue; | ||
1376 | if (nocommit) | 1323 | if (nocommit) |
1377 | break; | 1324 | break; |
1378 | pages = nfs_scan_commit(inode, &head, idx_start, npages); | 1325 | pages = nfs_scan_commit(inode, &head, idx_start, npages); |
@@ -1412,7 +1359,7 @@ int nfs_wb_all(struct inode *inode) | |||
1412 | }; | 1359 | }; |
1413 | int ret; | 1360 | int ret; |
1414 | 1361 | ||
1415 | ret = generic_writepages(mapping, &wbc); | 1362 | ret = nfs_writepages(mapping, &wbc); |
1416 | if (ret < 0) | 1363 | if (ret < 0) |
1417 | goto out; | 1364 | goto out; |
1418 | ret = nfs_sync_mapping_wait(mapping, &wbc, 0); | 1365 | ret = nfs_sync_mapping_wait(mapping, &wbc, 0); |
@@ -1435,11 +1382,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo | |||
1435 | }; | 1382 | }; |
1436 | int ret; | 1383 | int ret; |
1437 | 1384 | ||
1438 | if (!(how & FLUSH_NOWRITEPAGE)) { | 1385 | ret = nfs_writepages(mapping, &wbc); |
1439 | ret = generic_writepages(mapping, &wbc); | 1386 | if (ret < 0) |
1440 | if (ret < 0) | 1387 | goto out; |
1441 | goto out; | ||
1442 | } | ||
1443 | ret = nfs_sync_mapping_wait(mapping, &wbc, how); | 1388 | ret = nfs_sync_mapping_wait(mapping, &wbc, how); |
1444 | if (ret >= 0) | 1389 | if (ret >= 0) |
1445 | return 0; | 1390 | return 0; |
@@ -1462,7 +1407,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) | |||
1462 | int ret; | 1407 | int ret; |
1463 | 1408 | ||
1464 | BUG_ON(!PageLocked(page)); | 1409 | BUG_ON(!PageLocked(page)); |
1465 | if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) { | 1410 | if (clear_page_dirty_for_io(page)) { |
1466 | ret = nfs_writepage_locked(page, &wbc); | 1411 | ret = nfs_writepage_locked(page, &wbc); |
1467 | if (ret < 0) | 1412 | if (ret < 0) |
1468 | goto out; | 1413 | goto out; |