Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--  fs/xfs/xfs_aops.c  1027
1 file changed, 382 insertions(+), 645 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a9ebabfe7587..d445a64b979e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/writeback.h> 37#include <linux/writeback.h>
38 38
39/* flags for direct write completions */
40#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
41#define XFS_DIO_FLAG_APPEND (1 << 1)
42
43/*
44 * structure owned by writepages passed to individual writepage calls
45 */
46struct xfs_writepage_ctx {
47 struct xfs_bmbt_irec imap;
48 bool imap_valid;
49 unsigned int io_type;
50 struct xfs_ioend *ioend;
51 sector_t last_block;
52};
53
39void 54void
40xfs_count_page_state( 55xfs_count_page_state(
41 struct page *page, 56 struct page *page,
@@ -214,10 +229,12 @@ xfs_end_io(
214 struct xfs_inode *ip = XFS_I(ioend->io_inode); 229 struct xfs_inode *ip = XFS_I(ioend->io_inode);
215 int error = 0; 230 int error = 0;
216 231
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 232 /*
233 * Set an error if the mount has shut down and proceed with end I/O
234 * processing so it can perform whatever cleanups are necessary.
235 */
236 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 ioend->io_error = -EIO; 237 ioend->io_error = -EIO;
219 goto done;
220 }
221 238
222 /* 239 /*
223 * For unwritten extents we need to issue transactions to convert a 240 * For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
265 */ 282 */
266 atomic_set(&ioend->io_remaining, 1); 283 atomic_set(&ioend->io_remaining, 1);
267 ioend->io_error = 0; 284 ioend->io_error = 0;
268 ioend->io_list = NULL; 285 INIT_LIST_HEAD(&ioend->io_list);
269 ioend->io_type = type; 286 ioend->io_type = type;
270 ioend->io_inode = inode; 287 ioend->io_inode = inode;
271 ioend->io_buffer_head = NULL; 288 ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
283 struct inode *inode, 300 struct inode *inode,
284 loff_t offset, 301 loff_t offset,
285 struct xfs_bmbt_irec *imap, 302 struct xfs_bmbt_irec *imap,
286 int type, 303 int type)
287 int nonblocking)
288{ 304{
289 struct xfs_inode *ip = XFS_I(inode); 305 struct xfs_inode *ip = XFS_I(inode);
290 struct xfs_mount *mp = ip->i_mount; 306 struct xfs_mount *mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
300 if (type == XFS_IO_UNWRITTEN) 316 if (type == XFS_IO_UNWRITTEN)
301 bmapi_flags |= XFS_BMAPI_IGSTATE; 317 bmapi_flags |= XFS_BMAPI_IGSTATE;
302 318
303 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 319 xfs_ilock(ip, XFS_ILOCK_SHARED);
304 if (nonblocking)
305 return -EAGAIN;
306 xfs_ilock(ip, XFS_ILOCK_SHARED);
307 }
308
309 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 320 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
310 (ip->i_df.if_flags & XFS_IFEXTENTS)); 321 (ip->i_df.if_flags & XFS_IFEXTENTS));
311 ASSERT(offset <= mp->m_super->s_maxbytes); 322 ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
341 return 0; 352 return 0;
342} 353}
343 354
344STATIC int 355STATIC bool
345xfs_imap_valid( 356xfs_imap_valid(
346 struct inode *inode, 357 struct inode *inode,
347 struct xfs_bmbt_irec *imap, 358 struct xfs_bmbt_irec *imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
414STATIC void 425STATIC void
415xfs_start_page_writeback( 426xfs_start_page_writeback(
416 struct page *page, 427 struct page *page,
417 int clear_dirty, 428 int clear_dirty)
418 int buffers)
419{ 429{
420 ASSERT(PageLocked(page)); 430 ASSERT(PageLocked(page));
421 ASSERT(!PageWriteback(page)); 431 ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
434 set_page_writeback_keepwrite(page); 444 set_page_writeback_keepwrite(page);
435 445
436 unlock_page(page); 446 unlock_page(page);
437
438 /* If no buffers on the page are to be written, finish it here */
439 if (!buffers)
440 end_page_writeback(page);
441} 447}
442 448
443static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) 449static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
446} 452}
447 453
448/* 454/*
449 * Submit all of the bios for all of the ioends we have saved up, covering the 455 * Submit all of the bios for an ioend. We are only passed a single ioend at a
450 * initial writepage page and also any probed pages. 456 * time; the caller is responsible for chaining prior to submission.
451 *
452 * Because we may have multiple ioends spanning a page, we need to start
453 * writeback on all the buffers before we submit them for I/O. If we mark the
454 * buffers as we got, then we can end up with a page that only has buffers
455 * marked async write and I/O complete on can occur before we mark the other
456 * buffers async write.
457 *
458 * The end result of this is that we trip a bug in end_page_writeback() because
459 * we call it twice for the one page as the code in end_buffer_async_write()
460 * assumes that all buffers on the page are started at the same time.
461 *
462 * The fix is two passes across the ioend list - one to start writeback on the
463 * buffer_heads, and then submit them for I/O on the second pass.
464 * 457 *
465 * If @fail is non-zero, it means that we have a situation where some part of 458 * If @fail is non-zero, it means that we have a situation where some part of
466 * the submission process has failed after we have marked pages for writeback 459 * the submission process has failed after we have marked pages for writeback
467 * and unlocked them. In this situation, we need to fail the ioend chain rather 460 * and unlocked them. In this situation, we need to fail the ioend chain rather
468 * than submit it to IO. This typically only happens on a filesystem shutdown. 461 * than submit it to IO. This typically only happens on a filesystem shutdown.
469 */ 462 */
470STATIC void 463STATIC int
471xfs_submit_ioend( 464xfs_submit_ioend(
472 struct writeback_control *wbc, 465 struct writeback_control *wbc,
473 xfs_ioend_t *ioend, 466 xfs_ioend_t *ioend,
474 int fail) 467 int status)
475{ 468{
476 xfs_ioend_t *head = ioend;
477 xfs_ioend_t *next;
478 struct buffer_head *bh; 469 struct buffer_head *bh;
479 struct bio *bio; 470 struct bio *bio;
480 sector_t lastblock = 0; 471 sector_t lastblock = 0;
481 472
482 /* Pass 1 - start writeback */ 473 /* Reserve log space if we might write beyond the on-disk inode size. */
483 do { 474 if (!status &&
484 next = ioend->io_list; 475 ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
485 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 476 status = xfs_setfilesize_trans_alloc(ioend);
486 xfs_start_buffer_writeback(bh); 477 /*
487 } while ((ioend = next) != NULL); 478 * If we are failing the IO now, just mark the ioend with an
479 * error and finish it. This will run IO completion immediately
480 * as there is only one reference to the ioend at this point in
481 * time.
482 */
483 if (status) {
484 ioend->io_error = status;
485 xfs_finish_ioend(ioend);
486 return status;
487 }
488 488
489 /* Pass 2 - submit I/O */ 489 bio = NULL;
490 ioend = head; 490 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
491 do {
492 next = ioend->io_list;
493 bio = NULL;
494 491
495 /* 492 if (!bio) {
496 * If we are failing the IO now, just mark the ioend with an 493retry:
497 * error and finish it. This will run IO completion immediately 494 bio = xfs_alloc_ioend_bio(bh);
498 * as there is only one reference to the ioend at this point in 495 } else if (bh->b_blocknr != lastblock + 1) {
499 * time. 496 xfs_submit_ioend_bio(wbc, ioend, bio);
500 */ 497 goto retry;
501 if (fail) {
502 ioend->io_error = fail;
503 xfs_finish_ioend(ioend);
504 continue;
505 } 498 }
506 499
507 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
508
509 if (!bio) {
510 retry:
511 bio = xfs_alloc_ioend_bio(bh);
512 } else if (bh->b_blocknr != lastblock + 1) {
513 xfs_submit_ioend_bio(wbc, ioend, bio);
514 goto retry;
515 }
516
517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
518 xfs_submit_ioend_bio(wbc, ioend, bio);
519 goto retry;
520 }
521
522 lastblock = bh->b_blocknr;
523 }
524 if (bio)
525 xfs_submit_ioend_bio(wbc, ioend, bio); 501 xfs_submit_ioend_bio(wbc, ioend, bio);
526 xfs_finish_ioend(ioend); 502 goto retry;
527 } while ((ioend = next) != NULL); 503 }
528}
529
530/*
531 * Cancel submission of all buffer_heads so far in this endio.
532 * Toss the endio too. Only ever called for the initial page
533 * in a writepage request, so only ever one page.
534 */
535STATIC void
536xfs_cancel_ioend(
537 xfs_ioend_t *ioend)
538{
539 xfs_ioend_t *next;
540 struct buffer_head *bh, *next_bh;
541
542 do {
543 next = ioend->io_list;
544 bh = ioend->io_buffer_head;
545 do {
546 next_bh = bh->b_private;
547 clear_buffer_async_write(bh);
548 /*
549 * The unwritten flag is cleared when added to the
550 * ioend. We're not submitting for I/O so mark the
551 * buffer unwritten again for next time around.
552 */
553 if (ioend->io_type == XFS_IO_UNWRITTEN)
554 set_buffer_unwritten(bh);
555 unlock_buffer(bh);
556 } while ((bh = next_bh) != NULL);
557 504
558 mempool_free(ioend, xfs_ioend_pool); 505 lastblock = bh->b_blocknr;
559 } while ((ioend = next) != NULL); 506 }
507 if (bio)
508 xfs_submit_ioend_bio(wbc, ioend, bio);
509 xfs_finish_ioend(ioend);
510 return 0;
560} 511}
561 512
562/* 513/*
563 * Test to see if we've been building up a completion structure for 514 * Test to see if we've been building up a completion structure for
564 * earlier buffers -- if so, we try to append to this ioend if we 515 * earlier buffers -- if so, we try to append to this ioend if we
565 * can, otherwise we finish off any current ioend and start another. 516 * can, otherwise we finish off any current ioend and start another.
566 * Return true if we've finished the given ioend. 517 * Return the ioend we finished off so that the caller can submit it
518 * once it has finished processing the dirty page.
567 */ 519 */
568STATIC void 520STATIC void
569xfs_add_to_ioend( 521xfs_add_to_ioend(
570 struct inode *inode, 522 struct inode *inode,
571 struct buffer_head *bh, 523 struct buffer_head *bh,
572 xfs_off_t offset, 524 xfs_off_t offset,
573 unsigned int type, 525 struct xfs_writepage_ctx *wpc,
574 xfs_ioend_t **result, 526 struct list_head *iolist)
575 int need_ioend)
576{ 527{
577 xfs_ioend_t *ioend = *result; 528 if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
578 529 bh->b_blocknr != wpc->last_block + 1 ||
579 if (!ioend || need_ioend || type != ioend->io_type) { 530 offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
580 xfs_ioend_t *previous = *result; 531 struct xfs_ioend *new;
581 532
582 ioend = xfs_alloc_ioend(inode, type); 533 if (wpc->ioend)
583 ioend->io_offset = offset; 534 list_add(&wpc->ioend->io_list, iolist);
584 ioend->io_buffer_head = bh; 535
585 ioend->io_buffer_tail = bh; 536 new = xfs_alloc_ioend(inode, wpc->io_type);
586 if (previous) 537 new->io_offset = offset;
587 previous->io_list = ioend; 538 new->io_buffer_head = bh;
588 *result = ioend; 539 new->io_buffer_tail = bh;
540 wpc->ioend = new;
589 } else { 541 } else {
590 ioend->io_buffer_tail->b_private = bh; 542 wpc->ioend->io_buffer_tail->b_private = bh;
591 ioend->io_buffer_tail = bh; 543 wpc->ioend->io_buffer_tail = bh;
592 } 544 }
593 545
594 bh->b_private = NULL; 546 bh->b_private = NULL;
595 ioend->io_size += bh->b_size; 547 wpc->ioend->io_size += bh->b_size;
548 wpc->last_block = bh->b_blocknr;
549 xfs_start_buffer_writeback(bh);
596} 550}
597 551
598STATIC void 552STATIC void
@@ -678,183 +632,6 @@ xfs_check_page_type(
678 return false; 632 return false;
679} 633}
680 634
681/*
682 * Allocate & map buffers for page given the extent map. Write it out.
683 * except for the original page of a writepage, this is called on
684 * delalloc/unwritten pages only, for the original page it is possible
685 * that the page has no mapping at all.
686 */
687STATIC int
688xfs_convert_page(
689 struct inode *inode,
690 struct page *page,
691 loff_t tindex,
692 struct xfs_bmbt_irec *imap,
693 xfs_ioend_t **ioendp,
694 struct writeback_control *wbc)
695{
696 struct buffer_head *bh, *head;
697 xfs_off_t end_offset;
698 unsigned long p_offset;
699 unsigned int type;
700 int len, page_dirty;
701 int count = 0, done = 0, uptodate = 1;
702 xfs_off_t offset = page_offset(page);
703
704 if (page->index != tindex)
705 goto fail;
706 if (!trylock_page(page))
707 goto fail;
708 if (PageWriteback(page))
709 goto fail_unlock_page;
710 if (page->mapping != inode->i_mapping)
711 goto fail_unlock_page;
712 if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
713 goto fail_unlock_page;
714
715 /*
716 * page_dirty is initially a count of buffers on the page before
717 * EOF and is decremented as we move each into a cleanable state.
718 *
719 * Derivation:
720 *
721 * End offset is the highest offset that this page should represent.
722 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
723 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
724 * hence give us the correct page_dirty count. On any other page,
725 * it will be zero and in that case we need page_dirty to be the
726 * count of buffers on the page.
727 */
728 end_offset = min_t(unsigned long long,
729 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
730 i_size_read(inode));
731
732 /*
733 * If the current map does not span the entire page we are about to try
734 * to write, then give up. The only way we can write a page that spans
735 * multiple mappings in a single writeback iteration is via the
736 * xfs_vm_writepage() function. Data integrity writeback requires the
737 * entire page to be written in a single attempt, otherwise the part of
738 * the page we don't write here doesn't get written as part of the data
739 * integrity sync.
740 *
741 * For normal writeback, we also don't attempt to write partial pages
742 * here as it simply means that write_cache_pages() will see it under
743 * writeback and ignore the page until some point in the future, at
744 * which time this will be the only page in the file that needs
745 * writeback. Hence for more optimal IO patterns, we should always
746 * avoid partial page writeback due to multiple mappings on a page here.
747 */
748 if (!xfs_imap_valid(inode, imap, end_offset))
749 goto fail_unlock_page;
750
751 len = 1 << inode->i_blkbits;
752 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
753 PAGE_CACHE_SIZE);
754 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
755 page_dirty = p_offset / len;
756
757 /*
758 * The moment we find a buffer that doesn't match our current type
759 * specification or can't be written, abort the loop and start
760 * writeback. As per the above xfs_imap_valid() check, only
761 * xfs_vm_writepage() can handle partial page writeback fully - we are
762 * limited here to the buffers that are contiguous with the current
763 * ioend, and hence a buffer we can't write breaks that contiguity and
764 * we have to defer the rest of the IO to xfs_vm_writepage().
765 */
766 bh = head = page_buffers(page);
767 do {
768 if (offset >= end_offset)
769 break;
770 if (!buffer_uptodate(bh))
771 uptodate = 0;
772 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
773 done = 1;
774 break;
775 }
776
777 if (buffer_unwritten(bh) || buffer_delay(bh) ||
778 buffer_mapped(bh)) {
779 if (buffer_unwritten(bh))
780 type = XFS_IO_UNWRITTEN;
781 else if (buffer_delay(bh))
782 type = XFS_IO_DELALLOC;
783 else
784 type = XFS_IO_OVERWRITE;
785
786 /*
787 * imap should always be valid because of the above
788 * partial page end_offset check on the imap.
789 */
790 ASSERT(xfs_imap_valid(inode, imap, offset));
791
792 lock_buffer(bh);
793 if (type != XFS_IO_OVERWRITE)
794 xfs_map_at_offset(inode, bh, imap, offset);
795 xfs_add_to_ioend(inode, bh, offset, type,
796 ioendp, done);
797
798 page_dirty--;
799 count++;
800 } else {
801 done = 1;
802 break;
803 }
804 } while (offset += len, (bh = bh->b_this_page) != head);
805
806 if (uptodate && bh == head)
807 SetPageUptodate(page);
808
809 if (count) {
810 if (--wbc->nr_to_write <= 0 &&
811 wbc->sync_mode == WB_SYNC_NONE)
812 done = 1;
813 }
814 xfs_start_page_writeback(page, !page_dirty, count);
815
816 return done;
817 fail_unlock_page:
818 unlock_page(page);
819 fail:
820 return 1;
821}
822
823/*
824 * Convert & write out a cluster of pages in the same extent as defined
825 * by mp and following the start page.
826 */
827STATIC void
828xfs_cluster_write(
829 struct inode *inode,
830 pgoff_t tindex,
831 struct xfs_bmbt_irec *imap,
832 xfs_ioend_t **ioendp,
833 struct writeback_control *wbc,
834 pgoff_t tlast)
835{
836 struct pagevec pvec;
837 int done = 0, i;
838
839 pagevec_init(&pvec, 0);
840 while (!done && tindex <= tlast) {
841 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
842
843 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
844 break;
845
846 for (i = 0; i < pagevec_count(&pvec); i++) {
847 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
848 imap, ioendp, wbc);
849 if (done)
850 break;
851 }
852
853 pagevec_release(&pvec);
854 cond_resched();
855 }
856}
857
858STATIC void 635STATIC void
859xfs_vm_invalidatepage( 636xfs_vm_invalidatepage(
860 struct page *page, 637 struct page *page,
@@ -932,6 +709,164 @@ out_invalidate:
932} 709}
933 710
934/* 711/*
712 * We implement an immediate ioend submission policy here to avoid needing to
713 * chain multiple ioends and hence nest mempool allocations which can violate
714 * forward progress guarantees we need to provide. The current ioend we are
715 * adding buffers to is cached on the writepage context, and if the new buffer
716 * does not append to the cached ioend it will create a new ioend and cache that
717 * instead.
718 *
719 * If a new ioend is created and cached, the old ioend is returned and queued
720 * locally for submission once the entire page is processed or an error has been
721 * detected. While ioends are submitted immediately after they are completed,
722 * batching optimisations are provided by higher level block plugging.
723 *
724 * At the end of a writeback pass, there will be a cached ioend remaining on the
725 * writepage context that the caller will need to submit.
726 */
727static int
728xfs_writepage_map(
729 struct xfs_writepage_ctx *wpc,
730 struct writeback_control *wbc,
731 struct inode *inode,
732 struct page *page,
733 loff_t offset,
734 __uint64_t end_offset)
735{
736 LIST_HEAD(submit_list);
737 struct xfs_ioend *ioend, *next;
738 struct buffer_head *bh, *head;
739 ssize_t len = 1 << inode->i_blkbits;
740 int error = 0;
741 int count = 0;
742 int uptodate = 1;
743
744 bh = head = page_buffers(page);
745 offset = page_offset(page);
746 do {
747 if (offset >= end_offset)
748 break;
749 if (!buffer_uptodate(bh))
750 uptodate = 0;
751
752 /*
753 * set_page_dirty dirties all buffers in a page, independent
754 * of their state. The dirty state however is entirely
755 * meaningless for holes (!mapped && uptodate), so skip
756 * buffers covering holes here.
757 */
758 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
759 wpc->imap_valid = false;
760 continue;
761 }
762
763 if (buffer_unwritten(bh)) {
764 if (wpc->io_type != XFS_IO_UNWRITTEN) {
765 wpc->io_type = XFS_IO_UNWRITTEN;
766 wpc->imap_valid = false;
767 }
768 } else if (buffer_delay(bh)) {
769 if (wpc->io_type != XFS_IO_DELALLOC) {
770 wpc->io_type = XFS_IO_DELALLOC;
771 wpc->imap_valid = false;
772 }
773 } else if (buffer_uptodate(bh)) {
774 if (wpc->io_type != XFS_IO_OVERWRITE) {
775 wpc->io_type = XFS_IO_OVERWRITE;
776 wpc->imap_valid = false;
777 }
778 } else {
779 if (PageUptodate(page))
780 ASSERT(buffer_mapped(bh));
781 /*
782 * This buffer is not uptodate and will not be
783 * written to disk. Ensure that we will put any
784 * subsequent writeable buffers into a new
785 * ioend.
786 */
787 wpc->imap_valid = false;
788 continue;
789 }
790
791 if (wpc->imap_valid)
792 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
793 offset);
794 if (!wpc->imap_valid) {
795 error = xfs_map_blocks(inode, offset, &wpc->imap,
796 wpc->io_type);
797 if (error)
798 goto out;
799 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
800 offset);
801 }
802 if (wpc->imap_valid) {
803 lock_buffer(bh);
804 if (wpc->io_type != XFS_IO_OVERWRITE)
805 xfs_map_at_offset(inode, bh, &wpc->imap, offset);
806 xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
807 count++;
808 }
809
810 } while (offset += len, ((bh = bh->b_this_page) != head));
811
812 if (uptodate && bh == head)
813 SetPageUptodate(page);
814
815 ASSERT(wpc->ioend || list_empty(&submit_list));
816
817out:
818 /*
819 * On error, we have to fail the ioend here because we have locked
820 * buffers in the ioend. If we don't do this, we'll deadlock
821 * invalidating the page as that tries to lock the buffers on the page.
822 * Also, because we may have set pages under writeback, we have to make
823 * sure we run IO completion to mark the error state of the IO
824 * appropriately, so we can't cancel the ioend directly here. That means
825 * we have to mark this page as under writeback if we included any
826 * buffers from it in the ioend chain so that completion treats it
827 * correctly.
828 *
829 * If we didn't include the page in the ioend, then on error we can
830 * simply discard and unlock it as there are no other users of the page
831 * or its buffers right now. The caller will still need to trigger
832 * submission of outstanding ioends on the writepage context so they are
833 * treated correctly on error.
834 */
835 if (count) {
836 xfs_start_page_writeback(page, !error);
837
838 /*
839 * Preserve the original error if there was one, otherwise catch
840 * submission errors here and propagate into subsequent ioend
841 * submissions.
842 */
843 list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
844 int error2;
845
846 list_del_init(&ioend->io_list);
847 error2 = xfs_submit_ioend(wbc, ioend, error);
848 if (error2 && !error)
849 error = error2;
850 }
851 } else if (error) {
852 xfs_aops_discard_page(page);
853 ClearPageUptodate(page);
854 unlock_page(page);
855 } else {
856 /*
857 * We can end up here with no error and nothing to write if we
858 * race with a partial page truncate on a sub-page block sized
859 * filesystem. In that case we need to mark the page clean.
860 */
861 xfs_start_page_writeback(page, 1);
862 end_page_writeback(page);
863 }
864
865 mapping_set_error(page->mapping, error);
866 return error;
867}
868
869/*
935 * Write out a dirty page. 870 * Write out a dirty page.
936 * 871 *
937 * For delalloc space on the page we need to allocate space and flush it. 872 * For delalloc space on the page we need to allocate space and flush it.
@@ -940,22 +875,16 @@ out_invalidate:
940 * For any other dirty buffer heads on the page we should flush them. 875 * For any other dirty buffer heads on the page we should flush them.
941 */ 876 */
942STATIC int 877STATIC int
943xfs_vm_writepage( 878xfs_do_writepage(
944 struct page *page, 879 struct page *page,
945 struct writeback_control *wbc) 880 struct writeback_control *wbc,
881 void *data)
946{ 882{
883 struct xfs_writepage_ctx *wpc = data;
947 struct inode *inode = page->mapping->host; 884 struct inode *inode = page->mapping->host;
948 struct buffer_head *bh, *head;
949 struct xfs_bmbt_irec imap;
950 xfs_ioend_t *ioend = NULL, *iohead = NULL;
951 loff_t offset; 885 loff_t offset;
952 unsigned int type;
953 __uint64_t end_offset; 886 __uint64_t end_offset;
954 pgoff_t end_index, last_index; 887 pgoff_t end_index;
955 ssize_t len;
956 int err, imap_valid = 0, uptodate = 1;
957 int count = 0;
958 int nonblocking = 0;
959 888
960 trace_xfs_writepage(inode, page, 0, 0); 889 trace_xfs_writepage(inode, page, 0, 0);
961 890
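The submit_list handling added above boils down to a small pattern: queue completed ioends on a local list while walking the page, then submit each one afterwards, preserving the first error seen while still pushing the remaining entries through completion. A minimal userspace C sketch of that error-propagation loop follows; the names (unit, submit_unit) are illustrative only and not part of XFS.

#include <stdio.h>

struct unit {
	struct unit *next;
	int id;
};

/* Pretend submission: fail unit 2 to show how the first error is kept. */
static int submit_unit(struct unit *u, int prior_error)
{
	if (prior_error) {
		/* Still "complete" the unit so its resources are released. */
		printf("unit %d: failed with prior error %d\n", u->id, prior_error);
		return prior_error;
	}
	if (u->id == 2) {
		printf("unit %d: submission failed\n", u->id);
		return -5;	/* stand-in for -EIO */
	}
	printf("unit %d: submitted\n", u->id);
	return 0;
}

int main(void)
{
	struct unit c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct unit *head = &a;
	int error = 0;

	/* Submit everything; keep the first error but do not stop early. */
	for (struct unit *u = head; u; u = u->next) {
		int error2 = submit_unit(u, error);
		if (error2 && !error)
			error = error2;
	}
	printf("first error: %d\n", error);
	return 0;
}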
@@ -982,12 +911,9 @@ xfs_vm_writepage(
982 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 911 if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
983 goto redirty; 912 goto redirty;
984 913
985 /* Is this page beyond the end of the file? */
986 offset = i_size_read(inode);
987 end_index = offset >> PAGE_CACHE_SHIFT;
988 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
989
990 /* 914 /*
915 * Is this page beyond the end of the file?
916 *
991 * The page index is less than the end_index, adjust the end_offset 917 * The page index is less than the end_index, adjust the end_offset
992 * to the highest offset that this page should represent. 918 * to the highest offset that this page should represent.
993 * ----------------------------------------------------- 919 * -----------------------------------------------------
@@ -998,6 +924,8 @@ xfs_vm_writepage(
998 * | desired writeback range | see else | 924 * | desired writeback range | see else |
999 * ---------------------------------^------------------| 925 * ---------------------------------^------------------|
1000 */ 926 */
927 offset = i_size_read(inode);
928 end_index = offset >> PAGE_CACHE_SHIFT;
1001 if (page->index < end_index) 929 if (page->index < end_index)
1002 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 930 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
1003 else { 931 else {
@@ -1049,152 +977,7 @@ xfs_vm_writepage(
1049 end_offset = offset; 977 end_offset = offset;
1050 } 978 }
1051 979
1052 len = 1 << inode->i_blkbits; 980 return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
1053
1054 bh = head = page_buffers(page);
1055 offset = page_offset(page);
1056 type = XFS_IO_OVERWRITE;
1057
1058 if (wbc->sync_mode == WB_SYNC_NONE)
1059 nonblocking = 1;
1060
1061 do {
1062 int new_ioend = 0;
1063
1064 if (offset >= end_offset)
1065 break;
1066 if (!buffer_uptodate(bh))
1067 uptodate = 0;
1068
1069 /*
1070 * set_page_dirty dirties all buffers in a page, independent
1071 * of their state. The dirty state however is entirely
1072 * meaningless for holes (!mapped && uptodate), so skip
1073 * buffers covering holes here.
1074 */
1075 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1076 imap_valid = 0;
1077 continue;
1078 }
1079
1080 if (buffer_unwritten(bh)) {
1081 if (type != XFS_IO_UNWRITTEN) {
1082 type = XFS_IO_UNWRITTEN;
1083 imap_valid = 0;
1084 }
1085 } else if (buffer_delay(bh)) {
1086 if (type != XFS_IO_DELALLOC) {
1087 type = XFS_IO_DELALLOC;
1088 imap_valid = 0;
1089 }
1090 } else if (buffer_uptodate(bh)) {
1091 if (type != XFS_IO_OVERWRITE) {
1092 type = XFS_IO_OVERWRITE;
1093 imap_valid = 0;
1094 }
1095 } else {
1096 if (PageUptodate(page))
1097 ASSERT(buffer_mapped(bh));
1098 /*
1099 * This buffer is not uptodate and will not be
1100 * written to disk. Ensure that we will put any
1101 * subsequent writeable buffers into a new
1102 * ioend.
1103 */
1104 imap_valid = 0;
1105 continue;
1106 }
1107
1108 if (imap_valid)
1109 imap_valid = xfs_imap_valid(inode, &imap, offset);
1110 if (!imap_valid) {
1111 /*
1112 * If we didn't have a valid mapping then we need to
1113 * put the new mapping into a separate ioend structure.
1114 * This ensures non-contiguous extents always have
1115 * separate ioends, which is particularly important
1116 * for unwritten extent conversion at I/O completion
1117 * time.
1118 */
1119 new_ioend = 1;
1120 err = xfs_map_blocks(inode, offset, &imap, type,
1121 nonblocking);
1122 if (err)
1123 goto error;
1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1125 }
1126 if (imap_valid) {
1127 lock_buffer(bh);
1128 if (type != XFS_IO_OVERWRITE)
1129 xfs_map_at_offset(inode, bh, &imap, offset);
1130 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1131 new_ioend);
1132 count++;
1133 }
1134
1135 if (!iohead)
1136 iohead = ioend;
1137
1138 } while (offset += len, ((bh = bh->b_this_page) != head));
1139
1140 if (uptodate && bh == head)
1141 SetPageUptodate(page);
1142
1143 xfs_start_page_writeback(page, 1, count);
1144
1145 /* if there is no IO to be submitted for this page, we are done */
1146 if (!ioend)
1147 return 0;
1148
1149 ASSERT(iohead);
1150
1151 /*
1152 * Any errors from this point onwards need tobe reported through the IO
1153 * completion path as we have marked the initial page as under writeback
1154 * and unlocked it.
1155 */
1156 if (imap_valid) {
1157 xfs_off_t end_index;
1158
1159 end_index = imap.br_startoff + imap.br_blockcount;
1160
1161 /* to bytes */
1162 end_index <<= inode->i_blkbits;
1163
1164 /* to pages */
1165 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1166
1167 /* check against file size */
1168 if (end_index > last_index)
1169 end_index = last_index;
1170
1171 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1172 wbc, end_index);
1173 }
1174
1175
1176 /*
1177 * Reserve log space if we might write beyond the on-disk inode size.
1178 */
1179 err = 0;
1180 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1181 err = xfs_setfilesize_trans_alloc(ioend);
1182
1183 xfs_submit_ioend(wbc, iohead, err);
1184
1185 return 0;
1186
1187error:
1188 if (iohead)
1189 xfs_cancel_ioend(iohead);
1190
1191 if (err == -EAGAIN)
1192 goto redirty;
1193
1194 xfs_aops_discard_page(page);
1195 ClearPageUptodate(page);
1196 unlock_page(page);
1197 return err;
1198 981
1199redirty: 982redirty:
1200 redirty_page_for_writepage(wbc, page); 983 redirty_page_for_writepage(wbc, page);
@@ -1203,16 +986,40 @@ redirty:
1203} 986}
1204 987
1205STATIC int 988STATIC int
989xfs_vm_writepage(
990 struct page *page,
991 struct writeback_control *wbc)
992{
993 struct xfs_writepage_ctx wpc = {
994 .io_type = XFS_IO_INVALID,
995 };
996 int ret;
997
998 ret = xfs_do_writepage(page, wbc, &wpc);
999 if (wpc.ioend)
1000 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1001 return ret;
1002}
1003
1004STATIC int
1206xfs_vm_writepages( 1005xfs_vm_writepages(
1207 struct address_space *mapping, 1006 struct address_space *mapping,
1208 struct writeback_control *wbc) 1007 struct writeback_control *wbc)
1209{ 1008{
1009 struct xfs_writepage_ctx wpc = {
1010 .io_type = XFS_IO_INVALID,
1011 };
1012 int ret;
1013
1210 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1014 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1211 if (dax_mapping(mapping)) 1015 if (dax_mapping(mapping))
1212 return dax_writeback_mapping_range(mapping, 1016 return dax_writeback_mapping_range(mapping,
1213 xfs_find_bdev_for_inode(mapping->host), wbc); 1017 xfs_find_bdev_for_inode(mapping->host), wbc);
1214 1018
1215 return generic_writepages(mapping, wbc); 1019 ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1020 if (wpc.ioend)
1021 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1022 return ret;
1216} 1023}
1217 1024
1218/* 1025/*
@@ -1242,27 +1049,8 @@ xfs_vm_releasepage(
1242} 1049}
1243 1050
1244/* 1051/*
1245 * When we map a DIO buffer, we may need to attach an ioend that describes the 1052 * When we map a DIO buffer, we may need to pass flags to
1246 * type of write IO we are doing. This passes to the completion function the 1053 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1247 * operations it needs to perform. If the mapping is for an overwrite wholly
1248 * within the EOF then we don't need an ioend and so we don't allocate one.
1249 * This avoids the unnecessary overhead of allocating and freeing ioends for
1250 * workloads that don't require transactions on IO completion.
1251 *
1252 * If we get multiple mappings in a single IO, we might be mapping different
1253 * types. But because the direct IO can only have a single private pointer, we
1254 * need to ensure that:
1255 *
1256 * a) i) the ioend spans the entire region of unwritten mappings; or
1257 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1258 * b) if it contains unwritten extents, it is *permanently* marked as such
1259 *
1260 * We could do this by chaining ioends like buffered IO does, but we only
1261 * actually get one IO completion callback from the direct IO, and that spans
1262 * the entire IO regardless of how many mappings and IOs are needed to complete
1263 * the DIO. There is only going to be one reference to the ioend and its life
1264 * cycle is constrained by the DIO completion code. hence we don't need
1265 * reference counting here.
1266 * 1054 *
1267 * Note that for DIO, an IO to the highest supported file block offset (i.e. 1055 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1268 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 1056 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1270,68 +1058,26 @@ xfs_vm_releasepage(
1270 * extending the file size. We won't know for sure until IO completion is run 1058 * extending the file size. We won't know for sure until IO completion is run
1271 * and the actual max write offset is communicated to the IO completion 1059 * and the actual max write offset is communicated to the IO completion
1272 * routine. 1060 * routine.
1273 *
1274 * For DAX page faults, we are preparing to never see unwritten extents here,
1275 * nor should we ever extend the inode size. Hence we will soon have nothing to
1276 * do here for this case, ensuring we don't have to provide an IO completion
1277 * callback to free an ioend that we don't actually need for a fault into the
1278 * page at offset (2^63 - 1FSB) bytes.
1279 */ 1061 */
1280
1281static void 1062static void
1282xfs_map_direct( 1063xfs_map_direct(
1283 struct inode *inode, 1064 struct inode *inode,
1284 struct buffer_head *bh_result, 1065 struct buffer_head *bh_result,
1285 struct xfs_bmbt_irec *imap, 1066 struct xfs_bmbt_irec *imap,
1286 xfs_off_t offset, 1067 xfs_off_t offset)
1287 bool dax_fault)
1288{ 1068{
1289 struct xfs_ioend *ioend; 1069 uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
1290 xfs_off_t size = bh_result->b_size; 1070 xfs_off_t size = bh_result->b_size;
1291 int type;
1292
1293 if (ISUNWRITTEN(imap))
1294 type = XFS_IO_UNWRITTEN;
1295 else
1296 type = XFS_IO_OVERWRITE;
1297 1071
1298 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1072 trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1299 1073 ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
1300 if (dax_fault) {
1301 ASSERT(type == XFS_IO_OVERWRITE);
1302 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1303 imap);
1304 return;
1305 }
1306 1074
1307 if (bh_result->b_private) { 1075 if (ISUNWRITTEN(imap)) {
1308 ioend = bh_result->b_private; 1076 *flags |= XFS_DIO_FLAG_UNWRITTEN;
1309 ASSERT(ioend->io_size > 0); 1077 set_buffer_defer_completion(bh_result);
1310 ASSERT(offset >= ioend->io_offset); 1078 } else if (offset + size > i_size_read(inode) || offset + size < 0) {
1311 if (offset + size > ioend->io_offset + ioend->io_size) 1079 *flags |= XFS_DIO_FLAG_APPEND;
1312 ioend->io_size = offset - ioend->io_offset + size;
1313
1314 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1315 ioend->io_type = XFS_IO_UNWRITTEN;
1316
1317 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1318 ioend->io_size, ioend->io_type,
1319 imap);
1320 } else if (type == XFS_IO_UNWRITTEN ||
1321 offset + size > i_size_read(inode) ||
1322 offset + size < 0) {
1323 ioend = xfs_alloc_ioend(inode, type);
1324 ioend->io_offset = offset;
1325 ioend->io_size = size;
1326
1327 bh_result->b_private = ioend;
1328 set_buffer_defer_completion(bh_result); 1080 set_buffer_defer_completion(bh_result);
1329
1330 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1331 imap);
1332 } else {
1333 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1334 imap);
1335 } 1081 }
1336} 1082}
1337 1083
@@ -1502,9 +1248,12 @@ __xfs_get_blocks(
1502 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1503 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1504 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1505 if (create && direct) 1251 if (create && direct) {
1506 xfs_map_direct(inode, bh_result, &imap, offset, 1252 if (dax_fault)
1507 dax_fault); 1253 ASSERT(!ISUNWRITTEN(&imap));
1254 else
1255 xfs_map_direct(inode, bh_result, &imap, offset);
1256 }
1508 } 1257 }
1509 1258
1510 /* 1259 /*
@@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault(
1574 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); 1323 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1575} 1324}
1576 1325
1577static void 1326/*
1578__xfs_end_io_direct_write( 1327 * Complete a direct I/O write request.
1579 struct inode *inode, 1328 *
1580 struct xfs_ioend *ioend, 1329 * xfs_map_direct passes us some flags in the private data to tell us what to
1330 * do. If no flags are set, then the write IO is an overwrite wholly within
1331 * the existing allocated file size and so there is nothing for us to do.
1332 *
1333 * Note that in this case the completion can be called in interrupt context,
1334 * whereas if we have flags set we will always be called in task context
1335 * (i.e. from a workqueue).
1336 */
1337STATIC int
1338xfs_end_io_direct_write(
1339 struct kiocb *iocb,
1581 loff_t offset, 1340 loff_t offset,
1582 ssize_t size) 1341 ssize_t size,
1342 void *private)
1583{ 1343{
1584 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1344 struct inode *inode = file_inode(iocb->ki_filp);
1345 struct xfs_inode *ip = XFS_I(inode);
1346 struct xfs_mount *mp = ip->i_mount;
1347 uintptr_t flags = (uintptr_t)private;
1348 int error = 0;
1585 1349
1586 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1350 trace_xfs_end_io_direct_write(ip, offset, size);
1587 goto out_end_io;
1588 1351
1589 /* 1352 if (XFS_FORCED_SHUTDOWN(mp))
1590 * dio completion end_io functions are only called on writes if more 1353 return -EIO;
1591 * than 0 bytes was written.
1592 */
1593 ASSERT(size > 0);
1594 1354
1595 /* 1355 if (size <= 0)
1596 * The ioend only maps whole blocks, while the IO may be sector aligned. 1356 return size;
1597 * Hence the ioend offset/size may not match the IO offset/size exactly.
1598 * Because we don't map overwrites within EOF into the ioend, the offset
1599 * may not match, but only if the endio spans EOF. Either way, write
1600 * the IO sizes into the ioend so that completion processing does the
1601 * right thing.
1602 */
1603 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1604 ioend->io_size = size;
1605 ioend->io_offset = offset;
1606 1357
1607 /* 1358 /*
1608 * The ioend tells us whether we are doing unwritten extent conversion 1359 * The flags tell us whether we are doing unwritten extent conversions
1609 * or an append transaction that updates the on-disk file size. These 1360 * or an append transaction that updates the on-disk file size. These
1610 * cases are the only cases where we should *potentially* be needing 1361 * cases are the only cases where we should *potentially* be needing
1611 * to update the VFS inode size. 1362 * to update the VFS inode size.
1612 * 1363 */
1364 if (flags == 0) {
1365 ASSERT(offset + size <= i_size_read(inode));
1366 return 0;
1367 }
1368
1369 /*
1613 * We need to update the in-core inode size here so that we don't end up 1370 * We need to update the in-core inode size here so that we don't end up
1614 * with the on-disk inode size being outside the in-core inode size. We 1371 * with the on-disk inode size being outside the in-core inode size. We
1615 * have no other method of updating EOF for AIO, so always do it here 1372 * have no other method of updating EOF for AIO, so always do it here
@@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write(
1620 * here can result in EOF moving backwards and Bad Things Happen when 1377 * here can result in EOF moving backwards and Bad Things Happen when
1621 * that occurs. 1378 * that occurs.
1622 */ 1379 */
1623 spin_lock(&XFS_I(inode)->i_flags_lock); 1380 spin_lock(&ip->i_flags_lock);
1624 if (offset + size > i_size_read(inode)) 1381 if (offset + size > i_size_read(inode))
1625 i_size_write(inode, offset + size); 1382 i_size_write(inode, offset + size);
1626 spin_unlock(&XFS_I(inode)->i_flags_lock); 1383 spin_unlock(&ip->i_flags_lock);
1627 1384
1628 /* 1385 if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1629 * If we are doing an append IO that needs to update the EOF on disk, 1386 trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1630 * do the transaction reserve now so we can use common end io
1631 * processing. Stashing the error (if there is one) in the ioend will
1632 * result in the ioend processing passing on the error if it is
1633 * possible as we can't return it from here.
1634 */
1635 if (ioend->io_type == XFS_IO_OVERWRITE)
1636 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1637 1387
1638out_end_io: 1388 error = xfs_iomap_write_unwritten(ip, offset, size);
1639 xfs_end_io(&ioend->io_work); 1389 } else if (flags & XFS_DIO_FLAG_APPEND) {
1640 return; 1390 struct xfs_trans *tp;
1641}
1642 1391
1643/* 1392 trace_xfs_end_io_direct_write_append(ip, offset, size);
1644 * Complete a direct I/O write request.
1645 *
1646 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1647 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1648 * wholly within the EOF and so there is nothing for us to do. Note that in this
1649 * case the completion can be called in interrupt context, whereas if we have an
1650 * ioend we will always be called in task context (i.e. from a workqueue).
1651 */
1652STATIC void
1653xfs_end_io_direct_write(
1654 struct kiocb *iocb,
1655 loff_t offset,
1656 ssize_t size,
1657 void *private)
1658{
1659 struct inode *inode = file_inode(iocb->ki_filp);
1660 struct xfs_ioend *ioend = private;
1661
1662 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1663 ioend ? ioend->io_type : 0, NULL);
1664 1393
1665 if (!ioend) { 1394 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1666 ASSERT(offset + size <= i_size_read(inode)); 1395 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1667 return; 1396 if (error) {
1397 xfs_trans_cancel(tp);
1398 return error;
1399 }
1400 error = xfs_setfilesize(ip, tp, offset, size);
1668 } 1401 }
1669 1402
1670 __xfs_end_io_direct_write(inode, ioend, offset, size); 1403 return error;
1671} 1404}
1672 1405
1673static inline ssize_t 1406STATIC ssize_t
1674xfs_vm_do_dio( 1407xfs_vm_direct_IO(
1675 struct inode *inode,
1676 struct kiocb *iocb, 1408 struct kiocb *iocb,
1677 struct iov_iter *iter, 1409 struct iov_iter *iter,
1678 loff_t offset, 1410 loff_t offset)
1679 void (*endio)(struct kiocb *iocb,
1680 loff_t offset,
1681 ssize_t size,
1682 void *private),
1683 int flags)
1684{ 1411{
1412 struct inode *inode = iocb->ki_filp->f_mapping->host;
1413 dio_iodone_t *endio = NULL;
1414 int flags = 0;
1685 struct block_device *bdev; 1415 struct block_device *bdev;
1686 1416
1687 if (IS_DAX(inode)) 1417 if (iov_iter_rw(iter) == WRITE) {
1418 endio = xfs_end_io_direct_write;
1419 flags = DIO_ASYNC_EXTEND;
1420 }
1421
1422 if (IS_DAX(inode)) {
1688 return dax_do_io(iocb, inode, iter, offset, 1423 return dax_do_io(iocb, inode, iter, offset,
1689 xfs_get_blocks_direct, endio, 0); 1424 xfs_get_blocks_direct, endio, 0);
1425 }
1690 1426
1691 bdev = xfs_find_bdev_for_inode(inode); 1427 bdev = xfs_find_bdev_for_inode(inode);
1692 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1428 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1693 xfs_get_blocks_direct, endio, NULL, flags); 1429 xfs_get_blocks_direct, endio, NULL, flags);
1694}
1695
1696STATIC ssize_t
1697xfs_vm_direct_IO(
1698 struct kiocb *iocb,
1699 struct iov_iter *iter,
1700 loff_t offset)
1701{
1702 struct inode *inode = iocb->ki_filp->f_mapping->host;
1703
1704 if (iov_iter_rw(iter) == WRITE)
1705 return xfs_vm_do_dio(inode, iocb, iter, offset,
1706 xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
1707 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
1708} 1430}
1709 1431
1710/* 1432/*
@@ -1756,6 +1478,7 @@ xfs_vm_write_failed(
1756 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1478 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1757 loff_t to = from + len; 1479 loff_t to = from + len;
1758 struct buffer_head *bh, *head; 1480 struct buffer_head *bh, *head;
1481 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1759 1482
1760 /* 1483 /*
1761 * The request pos offset might be 32 or 64 bit, this is all fine 1484 * The request pos offset might be 32 or 64 bit, this is all fine
@@ -1787,14 +1510,23 @@ xfs_vm_write_failed(
1787 if (block_start >= to) 1510 if (block_start >= to)
1788 break; 1511 break;
1789 1512
1790 if (!buffer_delay(bh)) 1513 /*
1514 * Process delalloc and unwritten buffers beyond EOF. We can
1515 * encounter unwritten buffers in the event that a file has
1516 * post-EOF unwritten extents and an extending write happens to
1517 * fail (e.g., an unaligned write that also involves a delalloc
1518 * to the same page).
1519 */
1520 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1791 continue; 1521 continue;
1792 1522
1793 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1523 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1524 block_offset < i_size_read(inode))
1794 continue; 1525 continue;
1795 1526
1796 xfs_vm_kill_delalloc_range(inode, block_offset, 1527 if (buffer_delay(bh))
1797 block_offset + bh->b_size); 1528 xfs_vm_kill_delalloc_range(inode, block_offset,
1529 block_offset + bh->b_size);
1798 1530
1799 /* 1531 /*
1800 * This buffer does not contain data anymore. make sure anyone 1532 * This buffer does not contain data anymore. make sure anyone
@@ -1805,6 +1537,7 @@ xfs_vm_write_failed(
1805 clear_buffer_mapped(bh); 1537 clear_buffer_mapped(bh);
1806 clear_buffer_new(bh); 1538 clear_buffer_new(bh);
1807 clear_buffer_dirty(bh); 1539 clear_buffer_dirty(bh);
1540 clear_buffer_unwritten(bh);
1808 } 1541 }
1809 1542
1810} 1543}
@@ -1828,6 +1561,7 @@ xfs_vm_write_begin(
1828 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1561 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1829 struct page *page; 1562 struct page *page;
1830 int status; 1563 int status;
1564 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1831 1565
1832 ASSERT(len <= PAGE_CACHE_SIZE); 1566 ASSERT(len <= PAGE_CACHE_SIZE);
1833 1567
@@ -1836,6 +1570,8 @@ xfs_vm_write_begin(
1836 return -ENOMEM; 1570 return -ENOMEM;
1837 1571
1838 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1572 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1573 if (xfs_mp_fail_writes(mp))
1574 status = -EIO;
1839 if (unlikely(status)) { 1575 if (unlikely(status)) {
1840 struct inode *inode = mapping->host; 1576 struct inode *inode = mapping->host;
1841 size_t isize = i_size_read(inode); 1577 size_t isize = i_size_read(inode);
@@ -1848,6 +1584,8 @@ xfs_vm_write_begin(
1848 * allocated in this write, not blocks that were previously 1584 * allocated in this write, not blocks that were previously
1849 * written successfully. 1585 * written successfully.
1850 */ 1586 */
1587 if (xfs_mp_fail_writes(mp))
1588 isize = 0;
1851 if (pos + len > isize) { 1589 if (pos + len > isize) {
1852 ssize_t start = max_t(ssize_t, pos, isize); 1590 ssize_t start = max_t(ssize_t, pos, isize);
1853 1591
@@ -1957,7 +1695,6 @@ xfs_vm_set_page_dirty(
1957 loff_t end_offset; 1695 loff_t end_offset;
1958 loff_t offset; 1696 loff_t offset;
1959 int newly_dirty; 1697 int newly_dirty;
1960 struct mem_cgroup *memcg;
1961 1698
1962 if (unlikely(!mapping)) 1699 if (unlikely(!mapping))
1963 return !TestSetPageDirty(page); 1700 return !TestSetPageDirty(page);
@@ -1978,10 +1715,10 @@ xfs_vm_set_page_dirty(
1978 } while (bh != head); 1715 } while (bh != head);
1979 } 1716 }
1980 /* 1717 /*
1981 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 1718 * Lock out page->mem_cgroup migration to keep PageDirty
1982 * per-memcg dirty page counters. 1719 * synchronized with per-memcg dirty page counters.
1983 */ 1720 */
1984 memcg = mem_cgroup_begin_page_stat(page); 1721 lock_page_memcg(page);
1985 newly_dirty = !TestSetPageDirty(page); 1722 newly_dirty = !TestSetPageDirty(page);
1986 spin_unlock(&mapping->private_lock); 1723 spin_unlock(&mapping->private_lock);
1987 1724
@@ -1992,13 +1729,13 @@ xfs_vm_set_page_dirty(
1992 spin_lock_irqsave(&mapping->tree_lock, flags); 1729 spin_lock_irqsave(&mapping->tree_lock, flags);
1993 if (page->mapping) { /* Race with truncate? */ 1730 if (page->mapping) { /* Race with truncate? */
1994 WARN_ON_ONCE(!PageUptodate(page)); 1731 WARN_ON_ONCE(!PageUptodate(page));
1995 account_page_dirtied(page, mapping, memcg); 1732 account_page_dirtied(page, mapping);
1996 radix_tree_tag_set(&mapping->page_tree, 1733 radix_tree_tag_set(&mapping->page_tree,
1997 page_index(page), PAGECACHE_TAG_DIRTY); 1734 page_index(page), PAGECACHE_TAG_DIRTY);
1998 } 1735 }
1999 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1736 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2000 } 1737 }
2001 mem_cgroup_end_page_stat(memcg); 1738 unlock_page_memcg(page);
2002 if (newly_dirty) 1739 if (newly_dirty)
2003 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1740 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2004 return newly_dirty; 1741 return newly_dirty;