aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/aops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r--fs/ocfs2/aops.c1141
1 files changed, 561 insertions, 580 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index cda0361e95a4..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,153 +499,6 @@ bail:
499 return status; 499 return status;
500} 500}
501 501
502/*
503 * TODO: Make this into a generic get_blocks function.
504 *
505 * From do_direct_io in direct-io.c:
506 * "So what we do is to permit the ->get_blocks function to populate
507 * bh.b_size with the size of IO which is permitted at this offset and
508 * this i_blkbits."
509 *
510 * This function is called directly from get_more_blocks in direct-io.c.
511 *
512 * called like this: dio->get_blocks(dio->inode, fs_startblk,
513 * fs_count, map_bh, dio->rw == WRITE);
514 */
515static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
516 struct buffer_head *bh_result, int create)
517{
518 int ret;
519 u32 cpos = 0;
520 int alloc_locked = 0;
521 u64 p_blkno, inode_blocks, contig_blocks;
522 unsigned int ext_flags;
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529
530 /* This function won't even be called if the request isn't all
531 * nicely aligned and of the right size, so there's no need
532 * for us to check any of that. */
533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
538 /* This figures out the size of the next contiguous block, and
539 * our logical offset */
540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
544 if (ret) {
545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
546 (unsigned long long)iblock);
547 ret = -EIO;
548 goto bail;
549 }
550
551 /* We should already CoW the refcounted extent in case of create. */
552 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
553
554 /* allocate blocks if no p_blkno is found, and create == 1 */
555 if (!p_blkno && create) {
556 ret = ocfs2_inode_lock(inode, NULL, 1);
557 if (ret < 0) {
558 mlog_errno(ret);
559 goto bail;
560 }
561
562 alloc_locked = 1;
563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
566 /* fill hole, allocate blocks can't be larger than the size
567 * of the hole */
568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
569 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
570 contig_blocks);
571 if (clusters_to_alloc > contig_clusters)
572 clusters_to_alloc = contig_clusters;
573
574 /* allocate extent and insert them into the extent tree */
575 ret = ocfs2_extend_allocation(inode, cpos,
576 clusters_to_alloc, 0);
577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog_errno(ret);
580 goto bail;
581 }
582
583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
584 &contig_blocks, &ext_flags);
585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
588 (unsigned long long)iblock);
589 ret = -EIO;
590 goto bail;
591 }
592 set_buffer_new(bh_result);
593 up_write(&OCFS2_I(inode)->ip_alloc_sem);
594 }
595
596 /*
597 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result().
599 *
600 * Consider an unwritten extent as a hole.
601 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno);
604 else
605 clear_buffer_mapped(bh_result);
606
607 /* make sure we don't map more than max_blocks blocks here as
608 that's all the kernel will handle at this point. */
609 if (max_blocks < contig_blocks)
610 contig_blocks = max_blocks;
611 bh_result->b_size = contig_blocks << blocksize_bits;
612bail:
613 if (alloc_locked)
614 ocfs2_inode_unlock(inode, 1);
615 return ret;
616}
617
618/*
619 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
620 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
621 * to protect io on one node from truncation on another.
622 */
623static void ocfs2_dio_end_io(struct kiocb *iocb,
624 loff_t offset,
625 ssize_t bytes,
626 void *private)
627{
628 struct inode *inode = file_inode(iocb->ki_filp);
629 int level;
630
631 /* this io's submitter should not have unlocked this before we could */
632 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
633
634 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
635 ocfs2_iocb_clear_unaligned_aio(iocb);
636
637 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
638 }
639
640 /* Let rw unlock to be done later to protect append direct io write */
641 if (offset + bytes <= i_size_read(inode)) {
642 ocfs2_iocb_clear_rw_locked(iocb);
643
644 level = ocfs2_iocb_rw_locked_level(iocb);
645 ocfs2_rw_unlock(inode, level);
646 }
647}
648
649static int ocfs2_releasepage(struct page *page, gfp_t wait) 502static int ocfs2_releasepage(struct page *page, gfp_t wait)
650{ 503{
651 if (!page_has_buffers(page)) 504 if (!page_has_buffers(page))
@@ -653,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
653 return try_to_free_buffers(page); 506 return try_to_free_buffers(page);
654} 507}
655 508
656static int ocfs2_is_overwrite(struct ocfs2_super *osb,
657 struct inode *inode, loff_t offset)
658{
659 int ret = 0;
660 u32 v_cpos = 0;
661 u32 p_cpos = 0;
662 unsigned int num_clusters = 0;
663 unsigned int ext_flags = 0;
664
665 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
666 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
667 &num_clusters, &ext_flags);
668 if (ret < 0) {
669 mlog_errno(ret);
670 return ret;
671 }
672
673 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
674 return 1;
675
676 return 0;
677}
678
679static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
680 struct inode *inode, loff_t offset,
681 u64 zero_len, int cluster_align)
682{
683 u32 p_cpos = 0;
684 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
685 unsigned int num_clusters = 0;
686 unsigned int ext_flags = 0;
687 int ret = 0;
688
689 if (offset <= i_size_read(inode) || cluster_align)
690 return 0;
691
692 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
693 &ext_flags);
694 if (ret < 0) {
695 mlog_errno(ret);
696 return ret;
697 }
698
699 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
700 u64 s = i_size_read(inode);
701 sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
702 (do_div(s, osb->s_clustersize) >> 9);
703
704 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
705 zero_len >> 9, GFP_NOFS, false);
706 if (ret < 0)
707 mlog_errno(ret);
708 }
709
710 return ret;
711}
712
713static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
714 struct inode *inode, loff_t offset)
715{
716 u64 zero_start, zero_len, total_zero_len;
717 u32 p_cpos = 0, clusters_to_add;
718 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
719 unsigned int num_clusters = 0;
720 unsigned int ext_flags = 0;
721 u32 size_div, offset_div;
722 int ret = 0;
723
724 {
725 u64 o = offset;
726 u64 s = i_size_read(inode);
727
728 offset_div = do_div(o, osb->s_clustersize);
729 size_div = do_div(s, osb->s_clustersize);
730 }
731
732 if (offset <= i_size_read(inode))
733 return 0;
734
735 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
736 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
737 total_zero_len = offset - i_size_read(inode);
738 if (clusters_to_add)
739 total_zero_len -= offset_div;
740
741 /* Allocate clusters to fill out holes, and this is only needed
742 * when we add more than one clusters. Otherwise the cluster will
743 * be allocated during direct IO */
744 if (clusters_to_add > 1) {
745 ret = ocfs2_extend_allocation(inode,
746 OCFS2_I(inode)->ip_clusters,
747 clusters_to_add - 1, 0);
748 if (ret) {
749 mlog_errno(ret);
750 goto out;
751 }
752 }
753
754 while (total_zero_len) {
755 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
756 &ext_flags);
757 if (ret < 0) {
758 mlog_errno(ret);
759 goto out;
760 }
761
762 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
763 size_div;
764 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
765 size_div;
766 zero_len = min(total_zero_len, zero_len);
767
768 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
769 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
770 zero_start >> 9, zero_len >> 9,
771 GFP_NOFS, false);
772 if (ret < 0) {
773 mlog_errno(ret);
774 goto out;
775 }
776 }
777
778 total_zero_len -= zero_len;
779 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
780
781 /* Only at first iteration can be cluster not aligned.
782 * So set size_div to 0 for the rest */
783 size_div = 0;
784 }
785
786out:
787 return ret;
788}
789
/*
 * Direct-I/O write path.
 *
 * Sequence, as implemented below:
 *  1. If the write ends past i_size, the inode is first added to the
 *     orphan dir; it is removed again under clean_orphan.
 *  2. For an append (offset >= i_size) the inode lock is taken and the
 *     range up to offset is prepared: with sparse allocation the tail of
 *     the cluster holding i_size is zeroed, otherwise the gap is
 *     allocated and zeroed.  is_overwrite then records whether the
 *     cluster at offset is already allocated and written.
 *  3. The request is submitted through __blockdev_direct_IO().
 *  4. If that fails with anything but -EIOCBQUEUED and the write would
 *     have extended the file, ocfs2_truncate_file() is called with the
 *     i_size sampled after the failure (only if i_size is still the
 *     same once the inode lock is held) and the journal is committed.
 *  5. If it succeeds for an append into a cluster that was not an
 *     overwrite and offset is not cluster aligned, the head of that
 *     cluster (zero_len_head bytes) is zeroed on the block device.
 *  6. clean_orphan removes the inode from the orphan dir, passing
 *     offset + written as the end when written > 0, and commits the
 *     journal.
 *
 * Returns the value of __blockdev_direct_IO() (bytes written, or its
 * negative error including -EIOCBQUEUED) unless one of the surrounding
 * steps left a negative error in ret, in which case that error is
 * returned instead.
 */
790static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
791 struct iov_iter *iter,
792 loff_t offset)
793{
794 ssize_t ret = 0;
795 ssize_t written = 0;
796 bool orphaned = false;
797 int is_overwrite = 0;
798 struct file *file = iocb->ki_filp;
799 struct inode *inode = file_inode(file)->i_mapping->host;
800 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
801 struct buffer_head *di_bh = NULL;
802 size_t count = iter->count;
803 journal_t *journal = osb->journal->j_journal;
804 u64 zero_len_head, zero_len_tail;
805 int cluster_align_head, cluster_align_tail;
806 loff_t final_size = offset + count;
807 int append_write = offset >= i_size_read(inode) ? 1 : 0;
808 unsigned int num_clusters = 0;
809 unsigned int ext_flags = 0;
810
/*
 * Work out how far offset sits into its cluster (head) and how much of
 * the cluster holding i_size lies between i_size and offset (tail).
 * do_div() divides its first argument in place and returns the
 * remainder, hence the scratch copies o and s.
 */
811 {
812 u64 o = offset;
813 u64 s = i_size_read(inode);
814
815 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
816 cluster_align_head = !zero_len_head;
817
818 zero_len_tail = osb->s_clustersize -
819 do_div(s, osb->s_clustersize);
820 if ((offset - i_size_read(inode)) < zero_len_tail)
821 zero_len_tail = offset - i_size_read(inode);
822 cluster_align_tail = !zero_len_tail;
823 }
824
825 /*
826 * when final_size > inode->i_size, inode->i_size will be
827 * updated after direct write, so add the inode to orphan
828 * dir first.
829 */
830 if (final_size > i_size_read(inode)) {
831 ret = ocfs2_add_inode_to_orphan(osb, inode);
832 if (ret < 0) {
833 mlog_errno(ret);
834 goto out;
835 }
836 orphaned = true;
837 }
838
839 if (append_write) {
840 ret = ocfs2_inode_lock(inode, NULL, 1);
841 if (ret < 0) {
842 mlog_errno(ret);
843 goto clean_orphan;
844 }
845
846 /* zero the tail of the previously allocated cluster
847 * that has not been zeroed yet */
848 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
849 down_read(&OCFS2_I(inode)->ip_alloc_sem);
850 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
851 zero_len_tail, cluster_align_tail);
852 up_read(&OCFS2_I(inode)->ip_alloc_sem);
853 } else {
854 down_write(&OCFS2_I(inode)->ip_alloc_sem);
855 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
856 offset);
857 up_write(&OCFS2_I(inode)->ip_alloc_sem);
858 }
859 if (ret < 0) {
860 mlog_errno(ret);
861 ocfs2_inode_unlock(inode, 1);
862 goto clean_orphan;
863 }
864
865 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
866 if (is_overwrite < 0) {
867 mlog_errno(is_overwrite);
868 ret = is_overwrite;
869 ocfs2_inode_unlock(inode, 1);
870 goto clean_orphan;
871 }
872
873 ocfs2_inode_unlock(inode, 1);
874 }
875
876 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
877 offset, ocfs2_direct_IO_get_blocks,
878 ocfs2_dio_end_io, NULL, 0);
879 /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
880 if ((written < 0) && (written != -EIOCBQUEUED)) {
881 loff_t i_size = i_size_read(inode);
882
883 if (offset + count > i_size) {
884 ret = ocfs2_inode_lock(inode, &di_bh, 1);
885 if (ret < 0) {
886 mlog_errno(ret);
887 goto clean_orphan;
888 }
889
/* Skip the truncate if i_size changed before we got the inode lock. */
890 if (i_size == i_size_read(inode)) {
891 ret = ocfs2_truncate_file(inode, di_bh,
892 i_size);
893 if (ret < 0) {
894 if (ret != -ENOSPC)
895 mlog_errno(ret);
896
897 ocfs2_inode_unlock(inode, 1);
898 brelse(di_bh);
899 di_bh = NULL;
900 goto clean_orphan;
901 }
902 }
903
904 ocfs2_inode_unlock(inode, 1);
905 brelse(di_bh);
/* clean_orphan reuses di_bh, so do not leave a stale pointer behind. */
906 di_bh = NULL;
907
908 ret = jbd2_journal_force_commit(journal);
909 if (ret < 0)
910 mlog_errno(ret);
911 }
912 } else if (written > 0 && append_write && !is_overwrite &&
913 !cluster_align_head) {
914 /* zeroing out the allocated cluster head */
915 u32 p_cpos = 0;
916 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
917
918 ret = ocfs2_inode_lock(inode, NULL, 0);
919 if (ret < 0) {
920 mlog_errno(ret);
921 goto clean_orphan;
922 }
923
924 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
925 &num_clusters, &ext_flags);
926 if (ret < 0) {
927 mlog_errno(ret);
928 ocfs2_inode_unlock(inode, 0);
929 goto clean_orphan;
930 }
931
932 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
933
934 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
935 (u64)p_cpos << (osb->s_clustersize_bits - 9),
936 zero_len_head >> 9, GFP_NOFS, false);
937 if (ret < 0)
938 mlog_errno(ret);
939
940 ocfs2_inode_unlock(inode, 0);
941 }
942
943clean_orphan:
944 if (orphaned) {
945 int tmp_ret;
946 int update_isize = written > 0 ? 1 : 0;
947 loff_t end = update_isize ? offset + written : 0;
948
949 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
950 if (tmp_ret < 0) {
951 ret = tmp_ret;
952 mlog_errno(ret);
953 goto out;
954 }
955
956 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
957 update_isize, end);
958 if (tmp_ret < 0) {
959 ocfs2_inode_unlock(inode, 1);
960 ret = tmp_ret;
961 mlog_errno(ret);
962 brelse(di_bh);
963 goto out;
964 }
965
966 ocfs2_inode_unlock(inode, 1);
967 brelse(di_bh);
968
969 tmp_ret = jbd2_journal_force_commit(journal);
970 if (tmp_ret < 0) {
971 ret = tmp_ret;
972 mlog_errno(tmp_ret);
973 }
974 }
975
976out:
/* No error from the surrounding steps: report what the dio core returned. */
977 if (ret >= 0)
978 ret = written;
979 return ret;
980}
981
982static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
983 loff_t offset)
984{
985 struct file *file = iocb->ki_filp;
986 struct inode *inode = file_inode(file)->i_mapping->host;
987 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
988 int full_coherency = !(osb->s_mount_opt &
989 OCFS2_MOUNT_COHERENCY_BUFFERED);
990
991 /*
992 * Fallback to buffered I/O if we see an inode without
993 * extents.
994 */
995 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
996 return 0;
997
998 /* Fallback to buffered I/O if we are appending and
999 * concurrent O_DIRECT writes are allowed.
1000 */
1001 if (i_size_read(inode) <= offset && !full_coherency)
1002 return 0;
1003
1004 if (iov_iter_rw(iter) == READ)
1005 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
1006 iter, offset,
1007 ocfs2_direct_IO_get_blocks,
1008 ocfs2_dio_end_io, NULL, 0);
1009 else
1010 return ocfs2_direct_IO_write(iocb, iter, offset);
1011}
1012
1013static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 509static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
1014 u32 cpos, 510 u32 cpos,
1015 unsigned int *start, 511 unsigned int *start,
@@ -1196,6 +692,13 @@ next_bh:
1196 692
1197#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 693#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
1198 694
/*
 * Record of one cluster that a direct write has taken responsibility
 * for.  Each record is linked on two lists at once: the list of the
 * write context that created it and the per-inode list.
 */
695struct ocfs2_unwritten_extent {
/* Link in the owning write context's w_unwritten_list. */
696 struct list_head ue_node;
/* Link in the inode's ip_unwritten_list; modified under ip_lock. */
697 struct list_head ue_ip_node;
/* Cluster offset, copied from the write descriptor's c_cpos. */
698 u32 ue_cpos;
/* Physical cluster, copied from the write descriptor's c_phys. */
699 u32 ue_phys;
700};
701
1199/* 702/*
1200 * Describe the state of a single cluster to be written to. 703 * Describe the state of a single cluster to be written to.
1201 */ 704 */
@@ -1207,7 +710,7 @@ struct ocfs2_write_cluster_desc {
1207 * filled. 710 * filled.
1208 */ 711 */
1209 unsigned c_new; 712 unsigned c_new;
1210 unsigned c_unwritten; 713 unsigned c_clear_unwritten;
1211 unsigned c_needs_zero; 714 unsigned c_needs_zero;
1212}; 715};
1213 716
@@ -1219,6 +722,9 @@ struct ocfs2_write_ctxt {
1219 /* First cluster allocated in a nonsparse extend */ 722 /* First cluster allocated in a nonsparse extend */
1220 u32 w_first_new_cpos; 723 u32 w_first_new_cpos;
1221 724
725 /* Type of caller. Must be one of buffer, mmap, direct. */
726 ocfs2_write_type_t w_type;
727
1222 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 728 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
1223 729
1224 /* 730 /*
@@ -1267,6 +773,8 @@ struct ocfs2_write_ctxt {
1267 struct buffer_head *w_di_bh; 773 struct buffer_head *w_di_bh;
1268 774
1269 struct ocfs2_cached_dealloc_ctxt w_dealloc; 775 struct ocfs2_cached_dealloc_ctxt w_dealloc;
776
777 struct list_head w_unwritten_list;
1270}; 778};
1271 779
1272void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) 780void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1305,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
1305 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 813 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
1306} 814}
1307 815
1308static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 816static void ocfs2_free_unwritten_list(struct inode *inode,
817 struct list_head *head)
1309{ 818{
819 struct ocfs2_inode_info *oi = OCFS2_I(inode);
820 struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
821
822 list_for_each_entry_safe(ue, tmp, head, ue_node) {
823 list_del(&ue->ue_node);
824 spin_lock(&oi->ip_lock);
825 list_del(&ue->ue_ip_node);
826 spin_unlock(&oi->ip_lock);
827 kfree(ue);
828 }
829}
830
831static void ocfs2_free_write_ctxt(struct inode *inode,
832 struct ocfs2_write_ctxt *wc)
833{
834 ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
1310 ocfs2_unlock_pages(wc); 835 ocfs2_unlock_pages(wc);
1311 brelse(wc->w_di_bh); 836 brelse(wc->w_di_bh);
1312 kfree(wc); 837 kfree(wc);
@@ -1314,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
1314 839
1315static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 840static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1316 struct ocfs2_super *osb, loff_t pos, 841 struct ocfs2_super *osb, loff_t pos,
1317 unsigned len, struct buffer_head *di_bh) 842 unsigned len, ocfs2_write_type_t type,
843 struct buffer_head *di_bh)
1318{ 844{
1319 u32 cend; 845 u32 cend;
1320 struct ocfs2_write_ctxt *wc; 846 struct ocfs2_write_ctxt *wc;
@@ -1329,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1329 wc->w_clen = cend - wc->w_cpos + 1; 855 wc->w_clen = cend - wc->w_cpos + 1;
1330 get_bh(di_bh); 856 get_bh(di_bh);
1331 wc->w_di_bh = di_bh; 857 wc->w_di_bh = di_bh;
858 wc->w_type = type;
1332 859
1333 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 860 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1334 wc->w_large_pages = 1; 861 wc->w_large_pages = 1;
@@ -1336,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1336 wc->w_large_pages = 0; 863 wc->w_large_pages = 0;
1337 864
1338 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 865 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
866 INIT_LIST_HEAD(&wc->w_unwritten_list);
1339 867
1340 *wcp = wc; 868 *wcp = wc;
1341 869
@@ -1396,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
1396 to = user_pos + user_len; 924 to = user_pos + user_len;
1397 struct page *tmppage; 925 struct page *tmppage;
1398 926
1399 ocfs2_zero_new_buffers(wc->w_target_page, from, to); 927 if (wc->w_target_page)
928 ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1400 929
1401 for(i = 0; i < wc->w_num_pages; i++) { 930 for(i = 0; i < wc->w_num_pages; i++) {
1402 tmppage = wc->w_pages[i]; 931 tmppage = wc->w_pages[i];
1403 932
1404 if (page_has_buffers(tmppage)) { 933 if (tmppage && page_has_buffers(tmppage)) {
1405 if (ocfs2_should_order_data(inode)) 934 if (ocfs2_should_order_data(inode))
1406 ocfs2_jbd2_file_inode(wc->w_handle, inode); 935 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1407 936
@@ -1531,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1531 wc->w_num_pages = 1; 1060 wc->w_num_pages = 1;
1532 start = target_index; 1061 start = target_index;
1533 } 1062 }
1063 end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
1534 1064
1535 for(i = 0; i < wc->w_num_pages; i++) { 1065 for(i = 0; i < wc->w_num_pages; i++) {
1536 index = start + i; 1066 index = start + i;
1537 1067
1538 if (index == target_index && mmap_page) { 1068 if (index >= target_index && index <= end_index &&
1069 wc->w_type == OCFS2_WRITE_MMAP) {
1539 /* 1070 /*
1540 * ocfs2_pagemkwrite() is a little different 1071 * ocfs2_pagemkwrite() is a little different
1541 * and wants us to directly use the page 1072 * and wants us to directly use the page
@@ -1554,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1554 page_cache_get(mmap_page); 1085 page_cache_get(mmap_page);
1555 wc->w_pages[i] = mmap_page; 1086 wc->w_pages[i] = mmap_page;
1556 wc->w_target_locked = true; 1087 wc->w_target_locked = true;
1088 } else if (index >= target_index && index <= end_index &&
1089 wc->w_type == OCFS2_WRITE_DIRECT) {
1090 /* Direct write has no mapping page. */
1091 wc->w_pages[i] = NULL;
1092 continue;
1557 } else { 1093 } else {
1558 wc->w_pages[i] = find_or_create_page(mapping, index, 1094 wc->w_pages[i] = find_or_create_page(mapping, index,
1559 GFP_NOFS); 1095 GFP_NOFS);
@@ -1578,19 +1114,20 @@ out:
1578 * Prepare a single cluster for write one cluster into the file. 1114 * Prepare a single cluster for write one cluster into the file.
1579 */ 1115 */
1580static int ocfs2_write_cluster(struct address_space *mapping, 1116static int ocfs2_write_cluster(struct address_space *mapping,
1581 u32 phys, unsigned int unwritten, 1117 u32 *phys, unsigned int new,
1118 unsigned int clear_unwritten,
1582 unsigned int should_zero, 1119 unsigned int should_zero,
1583 struct ocfs2_alloc_context *data_ac, 1120 struct ocfs2_alloc_context *data_ac,
1584 struct ocfs2_alloc_context *meta_ac, 1121 struct ocfs2_alloc_context *meta_ac,
1585 struct ocfs2_write_ctxt *wc, u32 cpos, 1122 struct ocfs2_write_ctxt *wc, u32 cpos,
1586 loff_t user_pos, unsigned user_len) 1123 loff_t user_pos, unsigned user_len)
1587{ 1124{
1588 int ret, i, new; 1125 int ret, i;
1589 u64 v_blkno, p_blkno; 1126 u64 p_blkno;
1590 struct inode *inode = mapping->host; 1127 struct inode *inode = mapping->host;
1591 struct ocfs2_extent_tree et; 1128 struct ocfs2_extent_tree et;
1129 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
1592 1130
1593 new = phys == 0 ? 1 : 0;
1594 if (new) { 1131 if (new) {
1595 u32 tmp_pos; 1132 u32 tmp_pos;
1596 1133
@@ -1600,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1600 */ 1137 */
1601 tmp_pos = cpos; 1138 tmp_pos = cpos;
1602 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1139 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1603 &tmp_pos, 1, 0, wc->w_di_bh, 1140 &tmp_pos, 1, !clear_unwritten,
1604 wc->w_handle, data_ac, 1141 wc->w_di_bh, wc->w_handle,
1605 meta_ac, NULL); 1142 data_ac, meta_ac, NULL);
1606 /* 1143 /*
1607 * This shouldn't happen because we must have already 1144 * This shouldn't happen because we must have already
1608 * calculated the correct meta data allocation required. The 1145 * calculated the correct meta data allocation required. The
@@ -1619,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1619 mlog_errno(ret); 1156 mlog_errno(ret);
1620 goto out; 1157 goto out;
1621 } 1158 }
1622 } else if (unwritten) { 1159 } else if (clear_unwritten) {
1623 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1160 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1624 wc->w_di_bh); 1161 wc->w_di_bh);
1625 ret = ocfs2_mark_extent_written(inode, &et, 1162 ret = ocfs2_mark_extent_written(inode, &et,
1626 wc->w_handle, cpos, 1, phys, 1163 wc->w_handle, cpos, 1, *phys,
1627 meta_ac, &wc->w_dealloc); 1164 meta_ac, &wc->w_dealloc);
1628 if (ret < 0) { 1165 if (ret < 0) {
1629 mlog_errno(ret); 1166 mlog_errno(ret);
@@ -1631,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1631 } 1168 }
1632 } 1169 }
1633 1170
1634 if (should_zero)
1635 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1636 else
1637 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1638
1639 /* 1171 /*
1640 * The only reason this should fail is due to an inability to 1172 * The only reason this should fail is due to an inability to
1641 * find the extent added. 1173 * find the extent added.
1642 */ 1174 */
1643 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1175 ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
1644 NULL);
1645 if (ret < 0) { 1176 if (ret < 0) {
1646 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " 1177 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1647 "at logical block %llu", 1178 "at logical cluster %u",
1648 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1179 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
1649 (unsigned long long)v_blkno);
1650 goto out; 1180 goto out;
1651 } 1181 }
1652 1182
1653 BUG_ON(p_blkno == 0); 1183 BUG_ON(*phys == 0);
1184
1185 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
1186 if (!should_zero)
1187 p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
1654 1188
1655 for(i = 0; i < wc->w_num_pages; i++) { 1189 for(i = 0; i < wc->w_num_pages; i++) {
1656 int tmpret; 1190 int tmpret;
1657 1191
1192 /* This is the direct io target page. */
1193 if (wc->w_pages[i] == NULL) {
1194 p_blkno++;
1195 continue;
1196 }
1197
1658 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1198 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1659 wc->w_pages[i], cpos, 1199 wc->w_pages[i], cpos,
1660 user_pos, user_len, 1200 user_pos, user_len,
@@ -1701,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1701 if ((cluster_off + local_len) > osb->s_clustersize) 1241 if ((cluster_off + local_len) > osb->s_clustersize)
1702 local_len = osb->s_clustersize - cluster_off; 1242 local_len = osb->s_clustersize - cluster_off;
1703 1243
1704 ret = ocfs2_write_cluster(mapping, desc->c_phys, 1244 ret = ocfs2_write_cluster(mapping, &desc->c_phys,
1705 desc->c_unwritten, 1245 desc->c_new,
1246 desc->c_clear_unwritten,
1706 desc->c_needs_zero, 1247 desc->c_needs_zero,
1707 data_ac, meta_ac, 1248 data_ac, meta_ac,
1708 wc, desc->c_cpos, pos, local_len); 1249 wc, desc->c_cpos, pos, local_len);
@@ -1773,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1773} 1314}
1774 1315
1775/* 1316/*
1317 * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
1318 * do the zero work. And should not to clear UNWRITTEN since it will be cleared
1319 * by the direct io procedure.
1320 * If this is a new extent that allocated by direct io, we should mark it in
1321 * the ip_unwritten_list.
1322 */
1323static int ocfs2_unwritten_check(struct inode *inode,
1324 struct ocfs2_write_ctxt *wc,
1325 struct ocfs2_write_cluster_desc *desc)
1326{
1327 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1328 struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
1329 int ret = 0;
1330
1331 if (!desc->c_needs_zero)
1332 return 0;
1333
1334retry:
1335 spin_lock(&oi->ip_lock);
1336 /* Needs not to zero no metter buffer or direct. The one who is zero
1337 * the cluster is doing zero. And he will clear unwritten after all
1338 * cluster io finished. */
1339 list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
1340 if (desc->c_cpos == ue->ue_cpos) {
1341 BUG_ON(desc->c_new);
1342 desc->c_needs_zero = 0;
1343 desc->c_clear_unwritten = 0;
1344 goto unlock;
1345 }
1346 }
1347
1348 if (wc->w_type != OCFS2_WRITE_DIRECT)
1349 goto unlock;
1350
1351 if (new == NULL) {
1352 spin_unlock(&oi->ip_lock);
1353 new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
1354 GFP_NOFS);
1355 if (new == NULL) {
1356 ret = -ENOMEM;
1357 goto out;
1358 }
1359 goto retry;
1360 }
1361 /* This direct write will doing zero. */
1362 new->ue_cpos = desc->c_cpos;
1363 new->ue_phys = desc->c_phys;
1364 desc->c_clear_unwritten = 0;
1365 list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
1366 list_add_tail(&new->ue_node, &wc->w_unwritten_list);
1367 new = NULL;
1368unlock:
1369 spin_unlock(&oi->ip_lock);
1370out:
1371 if (new)
1372 kfree(new);
1373 return ret;
1374}
1375
1376/*
1776 * Populate each single-cluster write descriptor in the write context 1377 * Populate each single-cluster write descriptor in the write context
1777 * with information about the i/o to be done. 1378 * with information about the i/o to be done.
1778 * 1379 *
@@ -1847,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1847 if (phys == 0) { 1448 if (phys == 0) {
1848 desc->c_new = 1; 1449 desc->c_new = 1;
1849 desc->c_needs_zero = 1; 1450 desc->c_needs_zero = 1;
1451 desc->c_clear_unwritten = 1;
1850 *clusters_to_alloc = *clusters_to_alloc + 1; 1452 *clusters_to_alloc = *clusters_to_alloc + 1;
1851 } 1453 }
1852 1454
1853 if (ext_flags & OCFS2_EXT_UNWRITTEN) { 1455 if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1854 desc->c_unwritten = 1; 1456 desc->c_clear_unwritten = 1;
1855 desc->c_needs_zero = 1; 1457 desc->c_needs_zero = 1;
1856 } 1458 }
1857 1459
1460 ret = ocfs2_unwritten_check(inode, wc, desc);
1461 if (ret) {
1462 mlog_errno(ret);
1463 goto out;
1464 }
1465
1858 num_clusters--; 1466 num_clusters--;
1859 } 1467 }
1860 1468
@@ -2017,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
2017 if (ret) 1625 if (ret)
2018 mlog_errno(ret); 1626 mlog_errno(ret);
2019 1627
2020 wc->w_first_new_cpos = 1628 /* There is no wc if this is call from direct. */
2021 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 1629 if (wc)
1630 wc->w_first_new_cpos =
1631 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
2022 1632
2023 return ret; 1633 return ret;
2024} 1634}
@@ -2072,9 +1682,8 @@ out:
2072 return ret; 1682 return ret;
2073} 1683}
2074 1684
2075int ocfs2_write_begin_nolock(struct file *filp, 1685int ocfs2_write_begin_nolock(struct address_space *mapping,
2076 struct address_space *mapping, 1686 loff_t pos, unsigned len, ocfs2_write_type_t type,
2077 loff_t pos, unsigned len, unsigned flags,
2078 struct page **pagep, void **fsdata, 1687 struct page **pagep, void **fsdata,
2079 struct buffer_head *di_bh, struct page *mmap_page) 1688 struct buffer_head *di_bh, struct page *mmap_page)
2080{ 1689{
@@ -2091,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
2091 int try_free = 1, ret1; 1700 int try_free = 1, ret1;
2092 1701
2093try_again: 1702try_again:
2094 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1703 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
2095 if (ret) { 1704 if (ret) {
2096 mlog_errno(ret); 1705 mlog_errno(ret);
2097 return ret; 1706 return ret;
@@ -2110,14 +1719,17 @@ try_again:
2110 } 1719 }
2111 } 1720 }
2112 1721
2113 if (ocfs2_sparse_alloc(osb)) 1722 /* Direct io change i_size late, should not zero tail here. */
2114 ret = ocfs2_zero_tail(inode, di_bh, pos); 1723 if (type != OCFS2_WRITE_DIRECT) {
2115 else 1724 if (ocfs2_sparse_alloc(osb))
2116 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, 1725 ret = ocfs2_zero_tail(inode, di_bh, pos);
2117 wc); 1726 else
2118 if (ret) { 1727 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2119 mlog_errno(ret); 1728 len, wc);
2120 goto out; 1729 if (ret) {
1730 mlog_errno(ret);
1731 goto out;
1732 }
2121 } 1733 }
2122 1734
2123 ret = ocfs2_check_range_for_refcount(inode, pos, len); 1735 ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2148,7 +1760,7 @@ try_again:
2148 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1760 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2149 (long long)i_size_read(inode), 1761 (long long)i_size_read(inode),
2150 le32_to_cpu(di->i_clusters), 1762 le32_to_cpu(di->i_clusters),
2151 pos, len, flags, mmap_page, 1763 pos, len, type, mmap_page,
2152 clusters_to_alloc, extents_to_split); 1764 clusters_to_alloc, extents_to_split);
2153 1765
2154 /* 1766 /*
@@ -2178,17 +1790,17 @@ try_again:
2178 1790
2179 credits = ocfs2_calc_extend_credits(inode->i_sb, 1791 credits = ocfs2_calc_extend_credits(inode->i_sb,
2180 &di->id2.i_list); 1792 &di->id2.i_list);
2181 1793 } else if (type == OCFS2_WRITE_DIRECT)
2182 } 1794 /* direct write needs not to start trans if no extents alloc. */
1795 goto success;
2183 1796
2184 /* 1797 /*
2185 * We have to zero sparse allocated clusters, unwritten extent clusters, 1798 * We have to zero sparse allocated clusters, unwritten extent clusters,
2186 * and non-sparse clusters we just extended. For non-sparse writes, 1799 * and non-sparse clusters we just extended. For non-sparse writes,
2187 * we know zeros will only be needed in the first and/or last cluster. 1800 * we know zeros will only be needed in the first and/or last cluster.
2188 */ 1801 */
2189 if (clusters_to_alloc || extents_to_split || 1802 if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
2190 (wc->w_clen && (wc->w_desc[0].c_needs_zero || 1803 wc->w_desc[wc->w_clen - 1].c_needs_zero))
2191 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
2192 cluster_of_pages = 1; 1804 cluster_of_pages = 1;
2193 else 1805 else
2194 cluster_of_pages = 0; 1806 cluster_of_pages = 0;
@@ -2255,7 +1867,8 @@ try_again:
2255 ocfs2_free_alloc_context(meta_ac); 1867 ocfs2_free_alloc_context(meta_ac);
2256 1868
2257success: 1869success:
2258 *pagep = wc->w_target_page; 1870 if (pagep)
1871 *pagep = wc->w_target_page;
2259 *fsdata = wc; 1872 *fsdata = wc;
2260 return 0; 1873 return 0;
2261out_quota: 1874out_quota:
@@ -2266,7 +1879,7 @@ out_commit:
2266 ocfs2_commit_trans(osb, handle); 1879 ocfs2_commit_trans(osb, handle);
2267 1880
2268out: 1881out:
2269 ocfs2_free_write_ctxt(wc); 1882 ocfs2_free_write_ctxt(inode, wc);
2270 1883
2271 if (data_ac) { 1884 if (data_ac) {
2272 ocfs2_free_alloc_context(data_ac); 1885 ocfs2_free_alloc_context(data_ac);
@@ -2318,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
2318 */ 1931 */
2319 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1932 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2320 1933
2321 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, 1934 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
2322 fsdata, di_bh, NULL); 1935 pagep, fsdata, di_bh, NULL);
2323 if (ret) { 1936 if (ret) {
2324 mlog_errno(ret); 1937 mlog_errno(ret);
2325 goto out_fail; 1938 goto out_fail;
@@ -2376,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2376 handle_t *handle = wc->w_handle; 1989 handle_t *handle = wc->w_handle;
2377 struct page *tmppage; 1990 struct page *tmppage;
2378 1991
2379 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1992 BUG_ON(!list_empty(&wc->w_unwritten_list));
2380 OCFS2_JOURNAL_ACCESS_WRITE); 1993
2381 if (ret) { 1994 if (handle) {
2382 copied = ret; 1995 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2383 mlog_errno(ret); 1996 wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2384 goto out; 1997 if (ret) {
1998 copied = ret;
1999 mlog_errno(ret);
2000 goto out;
2001 }
2385 } 2002 }
2386 2003
2387 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2004 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2389,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2389 goto out_write_size; 2006 goto out_write_size;
2390 } 2007 }
2391 2008
2392 if (unlikely(copied < len)) { 2009 if (unlikely(copied < len) && wc->w_target_page) {
2393 if (!PageUptodate(wc->w_target_page)) 2010 if (!PageUptodate(wc->w_target_page))
2394 copied = 0; 2011 copied = 0;
2395 2012
2396 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 2013 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
2397 start+len); 2014 start+len);
2398 } 2015 }
2399 flush_dcache_page(wc->w_target_page); 2016 if (wc->w_target_page)
2017 flush_dcache_page(wc->w_target_page);
2400 2018
2401 for(i = 0; i < wc->w_num_pages; i++) { 2019 for(i = 0; i < wc->w_num_pages; i++) {
2402 tmppage = wc->w_pages[i]; 2020 tmppage = wc->w_pages[i];
2403 2021
2022 /* This is the direct io target page. */
2023 if (tmppage == NULL)
2024 continue;
2025
2404 if (tmppage == wc->w_target_page) { 2026 if (tmppage == wc->w_target_page) {
2405 from = wc->w_target_from; 2027 from = wc->w_target_from;
2406 to = wc->w_target_to; 2028 to = wc->w_target_to;
@@ -2419,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2419 } 2041 }
2420 2042
2421 if (page_has_buffers(tmppage)) { 2043 if (page_has_buffers(tmppage)) {
2422 if (ocfs2_should_order_data(inode)) 2044 if (handle && ocfs2_should_order_data(inode))
2423 ocfs2_jbd2_file_inode(wc->w_handle, inode); 2045 ocfs2_jbd2_file_inode(handle, inode);
2424 block_commit_write(tmppage, from, to); 2046 block_commit_write(tmppage, from, to);
2425 } 2047 }
2426 } 2048 }
2427 2049
2428out_write_size: 2050out_write_size:
2429 pos += copied; 2051 /* Direct io do not update i_size here. */
2430 if (pos > i_size_read(inode)) { 2052 if (wc->w_type != OCFS2_WRITE_DIRECT) {
2431 i_size_write(inode, pos); 2053 pos += copied;
2432 mark_inode_dirty(inode); 2054 if (pos > i_size_read(inode)) {
2433 } 2055 i_size_write(inode, pos);
2434 inode->i_blocks = ocfs2_inode_sector_count(inode); 2056 mark_inode_dirty(inode);
2435 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2057 }
2436 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2058 inode->i_blocks = ocfs2_inode_sector_count(inode);
2437 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2059 di->i_size = cpu_to_le64((u64)i_size_read(inode));
2438 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2060 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2439 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2061 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2440 ocfs2_journal_dirty(handle, wc->w_di_bh); 2062 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2063 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2064 }
2065 if (handle)
2066 ocfs2_journal_dirty(handle, wc->w_di_bh);
2441 2067
2442out: 2068out:
2443 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2069 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2447,7 +2073,8 @@ out:
2447 */ 2073 */
2448 ocfs2_unlock_pages(wc); 2074 ocfs2_unlock_pages(wc);
2449 2075
2450 ocfs2_commit_trans(osb, handle); 2076 if (handle)
2077 ocfs2_commit_trans(osb, handle);
2451 2078
2452 ocfs2_run_deallocs(osb, &wc->w_dealloc); 2079 ocfs2_run_deallocs(osb, &wc->w_dealloc);
2453 2080
@@ -2472,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2472 return ret; 2099 return ret;
2473} 2100}
2474 2101
2102struct ocfs2_dio_write_ctxt {
2103 struct list_head dw_zero_list;
2104 unsigned dw_zero_count;
2105 int dw_orphaned;
2106 pid_t dw_writer_pid;
2107};
2108
2109static struct ocfs2_dio_write_ctxt *
2110ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
2111{
2112 struct ocfs2_dio_write_ctxt *dwc = NULL;
2113
2114 if (bh->b_private)
2115 return bh->b_private;
2116
2117 dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
2118 if (dwc == NULL)
2119 return NULL;
2120 INIT_LIST_HEAD(&dwc->dw_zero_list);
2121 dwc->dw_zero_count = 0;
2122 dwc->dw_orphaned = 0;
2123 dwc->dw_writer_pid = task_pid_nr(current);
2124 bh->b_private = dwc;
2125 *alloc = 1;
2126
2127 return dwc;
2128}
2129
2130static void ocfs2_dio_free_write_ctx(struct inode *inode,
2131 struct ocfs2_dio_write_ctxt *dwc)
2132{
2133 ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
2134 kfree(dwc);
2135}
2136
2137/*
2138 * TODO: Make this into a generic get_blocks function.
2139 *
2140 * From do_direct_io in direct-io.c:
2141 * "So what we do is to permit the ->get_blocks function to populate
2142 * bh.b_size with the size of IO which is permitted at this offset and
2143 * this i_blkbits."
2144 *
2145 * This function is called directly from get_more_blocks in direct-io.c.
2146 *
2147 * called like this: dio->get_blocks(dio->inode, fs_startblk,
2148 * fs_count, map_bh, dio->rw == WRITE);
2149 */
2150static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2151 struct buffer_head *bh_result, int create)
2152{
2153 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2154 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2155 struct ocfs2_write_ctxt *wc;
2156 struct ocfs2_write_cluster_desc *desc = NULL;
2157 struct ocfs2_dio_write_ctxt *dwc = NULL;
2158 struct buffer_head *di_bh = NULL;
2159 u64 p_blkno;
2160 loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
2161 unsigned len, total_len = bh_result->b_size;
2162 int ret = 0, first_get_block = 0;
2163
2164 len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
2165 len = min(total_len, len);
2166
2167 mlog(0, "get block of %lu at %llu:%u req %u\n",
2168 inode->i_ino, pos, len, total_len);
2169
2170 /*
2171 * Because we need to change file size in ocfs2_dio_end_io_write(), or
2172 * we may need to add it to orphan dir. So can not fall to fast path
2173 * while file size will be changed.
2174 */
2175 if (pos + total_len <= i_size_read(inode)) {
2176 down_read(&oi->ip_alloc_sem);
2177 /* This is the fast path for re-write. */
2178 ret = ocfs2_get_block(inode, iblock, bh_result, create);
2179
2180 up_read(&oi->ip_alloc_sem);
2181
2182 if (buffer_mapped(bh_result) &&
2183 !buffer_new(bh_result) &&
2184 ret == 0)
2185 goto out;
2186
2187 /* Clear state set by ocfs2_get_block. */
2188 bh_result->b_state = 0;
2189 }
2190
2191 dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
2192 if (unlikely(dwc == NULL)) {
2193 ret = -ENOMEM;
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197
2198 if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
2199 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
2200 !dwc->dw_orphaned) {
2201 /*
2202 * when we are going to alloc extents beyond file size, add the
2203 * inode to orphan dir, so we can recall those spaces when
2204 * system crashed during write.
2205 */
2206 ret = ocfs2_add_inode_to_orphan(osb, inode);
2207 if (ret < 0) {
2208 mlog_errno(ret);
2209 goto out;
2210 }
2211 dwc->dw_orphaned = 1;
2212 }
2213
2214 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2215 if (ret) {
2216 mlog_errno(ret);
2217 goto out;
2218 }
2219
2220 down_write(&oi->ip_alloc_sem);
2221
2222 if (first_get_block) {
2223 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
2224 ret = ocfs2_zero_tail(inode, di_bh, pos);
2225 else
2226 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2227 total_len, NULL);
2228 if (ret < 0) {
2229 mlog_errno(ret);
2230 goto unlock;
2231 }
2232 }
2233
2234 ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
2235 OCFS2_WRITE_DIRECT, NULL,
2236 (void **)&wc, di_bh, NULL);
2237 if (ret) {
2238 mlog_errno(ret);
2239 goto unlock;
2240 }
2241
2242 desc = &wc->w_desc[0];
2243
2244 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
2245 BUG_ON(p_blkno == 0);
2246 p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
2247
2248 map_bh(bh_result, inode->i_sb, p_blkno);
2249 bh_result->b_size = len;
2250 if (desc->c_needs_zero)
2251 set_buffer_new(bh_result);
2252
2253 /* May sleep in end_io. It should not happen in a irq context. So defer
2254 * it to dio work queue. */
2255 set_buffer_defer_completion(bh_result);
2256
2257 if (!list_empty(&wc->w_unwritten_list)) {
2258 struct ocfs2_unwritten_extent *ue = NULL;
2259
2260 ue = list_first_entry(&wc->w_unwritten_list,
2261 struct ocfs2_unwritten_extent,
2262 ue_node);
2263 BUG_ON(ue->ue_cpos != desc->c_cpos);
2264 /* The physical address may be 0, fill it. */
2265 ue->ue_phys = desc->c_phys;
2266
2267 list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
2268 dwc->dw_zero_count++;
2269 }
2270
2271 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
2272 BUG_ON(ret != len);
2273 ret = 0;
2274unlock:
2275 up_write(&oi->ip_alloc_sem);
2276 ocfs2_inode_unlock(inode, 1);
2277 brelse(di_bh);
2278out:
2279 if (ret < 0)
2280 ret = -EIO;
2281 return ret;
2282}
2283
2284static void ocfs2_dio_end_io_write(struct inode *inode,
2285 struct ocfs2_dio_write_ctxt *dwc,
2286 loff_t offset,
2287 ssize_t bytes)
2288{
2289 struct ocfs2_cached_dealloc_ctxt dealloc;
2290 struct ocfs2_extent_tree et;
2291 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2292 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2293 struct ocfs2_unwritten_extent *ue = NULL;
2294 struct buffer_head *di_bh = NULL;
2295 struct ocfs2_dinode *di;
2296 struct ocfs2_alloc_context *data_ac = NULL;
2297 struct ocfs2_alloc_context *meta_ac = NULL;
2298 handle_t *handle = NULL;
2299 loff_t end = offset + bytes;
2300 int ret = 0, credits = 0, locked = 0;
2301
2302 ocfs2_init_dealloc_ctxt(&dealloc);
2303
2304 /* We do clear unwritten, delete orphan, change i_size here. If neither
2305 * of these happen, we can skip all this. */
2306 if (list_empty(&dwc->dw_zero_list) &&
2307 end <= i_size_read(inode) &&
2308 !dwc->dw_orphaned)
2309 goto out;
2310
2311 /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
2312 * are in that context. */
2313 if (dwc->dw_writer_pid != task_pid_nr(current)) {
2314 mutex_lock(&inode->i_mutex);
2315 locked = 1;
2316 }
2317
2318 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2319 if (ret < 0) {
2320 mlog_errno(ret);
2321 goto out;
2322 }
2323
2324 down_write(&oi->ip_alloc_sem);
2325
2326 /* Delete orphan before acquire i_mutex. */
2327 if (dwc->dw_orphaned) {
2328 BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
2329
2330 end = end > i_size_read(inode) ? end : 0;
2331
2332 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
2333 !!end, end);
2334 if (ret < 0)
2335 mlog_errno(ret);
2336 }
2337
2338 di = (struct ocfs2_dinode *)di_bh;
2339
2340 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
2341
2342 ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
2343 &data_ac, &meta_ac);
2344 if (ret) {
2345 mlog_errno(ret);
2346 goto unlock;
2347 }
2348
2349 credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
2350
2351 handle = ocfs2_start_trans(osb, credits);
2352 if (IS_ERR(handle)) {
2353 ret = PTR_ERR(handle);
2354 mlog_errno(ret);
2355 goto unlock;
2356 }
2357 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2358 OCFS2_JOURNAL_ACCESS_WRITE);
2359 if (ret) {
2360 mlog_errno(ret);
2361 goto commit;
2362 }
2363
2364 list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
2365 ret = ocfs2_mark_extent_written(inode, &et, handle,
2366 ue->ue_cpos, 1,
2367 ue->ue_phys,
2368 meta_ac, &dealloc);
2369 if (ret < 0) {
2370 mlog_errno(ret);
2371 break;
2372 }
2373 }
2374
2375 if (end > i_size_read(inode)) {
2376 ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
2377 if (ret < 0)
2378 mlog_errno(ret);
2379 }
2380commit:
2381 ocfs2_commit_trans(osb, handle);
2382unlock:
2383 up_write(&oi->ip_alloc_sem);
2384 ocfs2_inode_unlock(inode, 1);
2385 brelse(di_bh);
2386out:
2387 if (data_ac)
2388 ocfs2_free_alloc_context(data_ac);
2389 if (meta_ac)
2390 ocfs2_free_alloc_context(meta_ac);
2391 ocfs2_run_deallocs(osb, &dealloc);
2392 if (locked)
2393 mutex_unlock(&inode->i_mutex);
2394 ocfs2_dio_free_write_ctx(inode, dwc);
2395}
2396
2397/*
2398 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
2399 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
2400 * to protect io on one node from truncation on another.
2401 */
2402static int ocfs2_dio_end_io(struct kiocb *iocb,
2403 loff_t offset,
2404 ssize_t bytes,
2405 void *private)
2406{
2407 struct inode *inode = file_inode(iocb->ki_filp);
2408 int level;
2409
2410 if (bytes <= 0)
2411 return 0;
2412
2413 /* this io's submitter should not have unlocked this before we could */
2414 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
2415
2416 if (private)
2417 ocfs2_dio_end_io_write(inode, private, offset, bytes);
2418
2419 ocfs2_iocb_clear_rw_locked(iocb);
2420
2421 level = ocfs2_iocb_rw_locked_level(iocb);
2422 ocfs2_rw_unlock(inode, level);
2423 return 0;
2424}
2425
2426static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2427 loff_t offset)
2428{
2429 struct file *file = iocb->ki_filp;
2430 struct inode *inode = file_inode(file)->i_mapping->host;
2431 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2432 loff_t end = offset + iter->count;
2433 get_block_t *get_block;
2434
2435 /*
2436 * Fallback to buffered I/O if we see an inode without
2437 * extents.
2438 */
2439 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2440 return 0;
2441
2442 /* Fallback to buffered I/O if we do not support append dio. */
2443 if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
2444 return 0;
2445
2446 if (iov_iter_rw(iter) == READ)
2447 get_block = ocfs2_get_block;
2448 else
2449 get_block = ocfs2_dio_get_block;
2450
2451 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
2452 iter, offset, get_block,
2453 ocfs2_dio_end_io, NULL, 0);
2454}
2455
2475const struct address_space_operations ocfs2_aops = { 2456const struct address_space_operations ocfs2_aops = {
2476 .readpage = ocfs2_readpage, 2457 .readpage = ocfs2_readpage,
2477 .readpages = ocfs2_readpages, 2458 .readpages = ocfs2_readpages,