diff options
author | Josef Bacik <josef@redhat.com> | 2010-05-23 11:07:21 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2010-05-25 10:34:57 -0400 |
commit | 11c65dccf70be9ace5dbd3906778e1a099b1fee1 (patch) | |
tree | 1289f139ddf652e39672374b6f9051994c21ce57 | |
parent | 4b46fce23349bfca781a32e2707a18328ca5ae22 (diff) |
Btrfs: do aio_write instead of write
In order for AIO to work, we need to implement aio_write. This patch converts
our btrfs_file_write to btrfs_file_aio_write. I've tested this with xfstests and
nothing broke, and the AIO stuff magically started working. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | fs/btrfs/extent_io.c | 11 | ||||
-rw-r--r-- | fs/btrfs/file.c | 176 |
2 files changed, 104 insertions, 83 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1a57c17d4029..a53aca338c7f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2017,6 +2017,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
2017 | sector_t sector; | 2017 | sector_t sector; |
2018 | struct extent_map *em; | 2018 | struct extent_map *em; |
2019 | struct block_device *bdev; | 2019 | struct block_device *bdev; |
2020 | struct btrfs_ordered_extent *ordered; | ||
2020 | int ret; | 2021 | int ret; |
2021 | int nr = 0; | 2022 | int nr = 0; |
2022 | size_t page_offset = 0; | 2023 | size_t page_offset = 0; |
@@ -2028,7 +2029,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
2028 | set_page_extent_mapped(page); | 2029 | set_page_extent_mapped(page); |
2029 | 2030 | ||
2030 | end = page_end; | 2031 | end = page_end; |
2031 | lock_extent(tree, start, end, GFP_NOFS); | 2032 | while (1) { |
2033 | lock_extent(tree, start, end, GFP_NOFS); | ||
2034 | ordered = btrfs_lookup_ordered_extent(inode, start); | ||
2035 | if (!ordered) | ||
2036 | break; | ||
2037 | unlock_extent(tree, start, end, GFP_NOFS); | ||
2038 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
2039 | btrfs_put_ordered_extent(ordered); | ||
2040 | } | ||
2032 | 2041 | ||
2033 | if (page->index == last_byte >> PAGE_CACHE_SHIFT) { | 2042 | if (page->index == last_byte >> PAGE_CACHE_SHIFT) { |
2034 | char *userpage; | 2043 | char *userpage; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a28810abfb98..233aea2e5ef2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@ | |||
46 | static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, | 46 | static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, |
47 | int write_bytes, | 47 | int write_bytes, |
48 | struct page **prepared_pages, | 48 | struct page **prepared_pages, |
49 | const char __user *buf) | 49 | struct iov_iter *i) |
50 | { | 50 | { |
51 | long page_fault = 0; | 51 | size_t copied; |
52 | int i; | 52 | int pg = 0; |
53 | int offset = pos & (PAGE_CACHE_SIZE - 1); | 53 | int offset = pos & (PAGE_CACHE_SIZE - 1); |
54 | 54 | ||
55 | for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { | 55 | while (write_bytes > 0) { |
56 | size_t count = min_t(size_t, | 56 | size_t count = min_t(size_t, |
57 | PAGE_CACHE_SIZE - offset, write_bytes); | 57 | PAGE_CACHE_SIZE - offset, write_bytes); |
58 | struct page *page = prepared_pages[i]; | 58 | struct page *page = prepared_pages[pg]; |
59 | fault_in_pages_readable(buf, count); | 59 | again: |
60 | if (unlikely(iov_iter_fault_in_readable(i, count))) | ||
61 | return -EFAULT; | ||
60 | 62 | ||
61 | /* Copy data from userspace to the current page */ | 63 | /* Copy data from userspace to the current page */ |
62 | kmap(page); | 64 | copied = iov_iter_copy_from_user(page, i, offset, count); |
63 | page_fault = __copy_from_user(page_address(page) + offset, | 65 | |
64 | buf, count); | ||
65 | /* Flush processor's dcache for this page */ | 66 | /* Flush processor's dcache for this page */ |
66 | flush_dcache_page(page); | 67 | flush_dcache_page(page); |
67 | kunmap(page); | 68 | iov_iter_advance(i, copied); |
68 | buf += count; | 69 | write_bytes -= copied; |
69 | write_bytes -= count; | ||
70 | 70 | ||
71 | if (page_fault) | 71 | if (unlikely(copied == 0)) { |
72 | break; | 72 | count = min_t(size_t, PAGE_CACHE_SIZE - offset, |
73 | iov_iter_single_seg_count(i)); | ||
74 | goto again; | ||
75 | } | ||
76 | |||
77 | if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { | ||
78 | offset += copied; | ||
79 | } else { | ||
80 | pg++; | ||
81 | offset = 0; | ||
82 | } | ||
73 | } | 83 | } |
74 | return page_fault ? -EFAULT : 0; | 84 | return 0; |
75 | } | 85 | } |
76 | 86 | ||
77 | /* | 87 | /* |
@@ -822,60 +832,24 @@ again: | |||
822 | return 0; | 832 | return 0; |
823 | } | 833 | } |
824 | 834 | ||
825 | /* Copied from read-write.c */ | 835 | static ssize_t btrfs_file_aio_write(struct kiocb *iocb, |
826 | static void wait_on_retry_sync_kiocb(struct kiocb *iocb) | 836 | const struct iovec *iov, |
827 | { | 837 | unsigned long nr_segs, loff_t pos) |
828 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
829 | if (!kiocbIsKicked(iocb)) | ||
830 | schedule(); | ||
831 | else | ||
832 | kiocbClearKicked(iocb); | ||
833 | __set_current_state(TASK_RUNNING); | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Just a copy of what do_sync_write does. | ||
838 | */ | ||
839 | static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, | ||
840 | size_t count, loff_t pos, loff_t *ppos) | ||
841 | { | 838 | { |
842 | struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; | 839 | struct file *file = iocb->ki_filp; |
843 | unsigned long nr_segs = 1; | 840 | struct inode *inode = fdentry(file)->d_inode; |
844 | struct kiocb kiocb; | 841 | struct btrfs_root *root = BTRFS_I(inode)->root; |
845 | ssize_t ret; | 842 | struct page *pinned[2]; |
846 | 843 | struct page **pages = NULL; | |
847 | init_sync_kiocb(&kiocb, file); | 844 | struct iov_iter i; |
848 | kiocb.ki_pos = pos; | 845 | loff_t *ppos = &iocb->ki_pos; |
849 | kiocb.ki_left = count; | ||
850 | kiocb.ki_nbytes = count; | ||
851 | |||
852 | while (1) { | ||
853 | ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, | ||
854 | ppos, count, count); | ||
855 | if (ret != -EIOCBRETRY) | ||
856 | break; | ||
857 | wait_on_retry_sync_kiocb(&kiocb); | ||
858 | } | ||
859 | |||
860 | if (ret == -EIOCBQUEUED) | ||
861 | ret = wait_on_sync_kiocb(&kiocb); | ||
862 | *ppos = kiocb.ki_pos; | ||
863 | return ret; | ||
864 | } | ||
865 | |||
866 | static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | ||
867 | size_t count, loff_t *ppos) | ||
868 | { | ||
869 | loff_t pos; | ||
870 | loff_t start_pos; | 846 | loff_t start_pos; |
871 | ssize_t num_written = 0; | 847 | ssize_t num_written = 0; |
872 | ssize_t err = 0; | 848 | ssize_t err = 0; |
849 | size_t count; | ||
850 | size_t ocount; | ||
873 | int ret = 0; | 851 | int ret = 0; |
874 | struct inode *inode = fdentry(file)->d_inode; | ||
875 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
876 | struct page **pages = NULL; | ||
877 | int nrptrs; | 852 | int nrptrs; |
878 | struct page *pinned[2]; | ||
879 | unsigned long first_index; | 853 | unsigned long first_index; |
880 | unsigned long last_index; | 854 | unsigned long last_index; |
881 | int will_write; | 855 | int will_write; |
@@ -887,13 +861,17 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
887 | pinned[0] = NULL; | 861 | pinned[0] = NULL; |
888 | pinned[1] = NULL; | 862 | pinned[1] = NULL; |
889 | 863 | ||
890 | pos = *ppos; | ||
891 | start_pos = pos; | 864 | start_pos = pos; |
892 | 865 | ||
893 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | 866 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
894 | 867 | ||
895 | mutex_lock(&inode->i_mutex); | 868 | mutex_lock(&inode->i_mutex); |
896 | 869 | ||
870 | err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); | ||
871 | if (err) | ||
872 | goto out; | ||
873 | count = ocount; | ||
874 | |||
897 | current->backing_dev_info = inode->i_mapping->backing_dev_info; | 875 | current->backing_dev_info = inode->i_mapping->backing_dev_info; |
898 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | 876 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); |
899 | if (err) | 877 | if (err) |
@@ -910,14 +888,48 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
910 | BTRFS_I(inode)->sequence++; | 888 | BTRFS_I(inode)->sequence++; |
911 | 889 | ||
912 | if (unlikely(file->f_flags & O_DIRECT)) { | 890 | if (unlikely(file->f_flags & O_DIRECT)) { |
913 | num_written = __btrfs_direct_write(file, buf, count, pos, | 891 | ret = btrfs_delalloc_reserve_space(inode, count); |
914 | ppos); | 892 | if (ret) |
915 | pos += num_written; | 893 | goto out; |
916 | count -= num_written; | ||
917 | 894 | ||
918 | /* We've written everything we wanted to, exit */ | 895 | num_written = generic_file_direct_write(iocb, iov, &nr_segs, |
919 | if (num_written < 0 || !count) | 896 | pos, ppos, count, |
897 | ocount); | ||
898 | |||
899 | /* | ||
900 | * the generic O_DIRECT will update in-memory i_size after the | ||
901 | * DIOs are done. But our endio handlers that update the on | ||
902 | * disk i_size never update past the in memory i_size. So we | ||
903 | * need one more update here to catch any additions to the | ||
904 | * file | ||
905 | */ | ||
906 | if (inode->i_size != BTRFS_I(inode)->disk_i_size) { | ||
907 | btrfs_ordered_update_i_size(inode, inode->i_size, NULL); | ||
908 | mark_inode_dirty(inode); | ||
909 | } | ||
910 | |||
911 | if (num_written < 0) { | ||
912 | if (num_written != -EIOCBQUEUED) { | ||
913 | /* | ||
914 | * aio land will take care of releasing the | ||
915 | * delalloc | ||
916 | */ | ||
917 | btrfs_delalloc_release_space(inode, count); | ||
918 | } | ||
919 | ret = num_written; | ||
920 | num_written = 0; | ||
920 | goto out; | 921 | goto out; |
922 | } else if (num_written == count) { | ||
923 | /* pick up pos changes done by the generic code */ | ||
924 | pos = *ppos; | ||
925 | goto out; | ||
926 | } | ||
927 | |||
928 | /* | ||
929 | * the buffered IO will reserve bytes for the rest of the | ||
930 | * range, don't double count them here | ||
931 | */ | ||
932 | btrfs_delalloc_release_space(inode, count - num_written); | ||
921 | 933 | ||
922 | /* | 934 | /* |
923 | * We are going to do buffered for the rest of the range, so we | 935 | * We are going to do buffered for the rest of the range, so we |
@@ -925,18 +937,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
925 | * done. | 937 | * done. |
926 | */ | 938 | */ |
927 | buffered = 1; | 939 | buffered = 1; |
928 | buf += num_written; | 940 | pos += num_written; |
929 | } | 941 | } |
930 | 942 | ||
931 | nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, | 943 | iov_iter_init(&i, iov, nr_segs, count, num_written); |
932 | PAGE_CACHE_SIZE / (sizeof(struct page *))); | 944 | nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / |
945 | PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / | ||
946 | (sizeof(struct page *))); | ||
933 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); | 947 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); |
934 | 948 | ||
935 | /* generic_write_checks can change our pos */ | 949 | /* generic_write_checks can change our pos */ |
936 | start_pos = pos; | 950 | start_pos = pos; |
937 | 951 | ||
938 | first_index = pos >> PAGE_CACHE_SHIFT; | 952 | first_index = pos >> PAGE_CACHE_SHIFT; |
939 | last_index = (pos + count) >> PAGE_CACHE_SHIFT; | 953 | last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; |
940 | 954 | ||
941 | /* | 955 | /* |
942 | * there are lots of better ways to do this, but this code | 956 | * there are lots of better ways to do this, but this code |
@@ -953,7 +967,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
953 | unlock_page(pinned[0]); | 967 | unlock_page(pinned[0]); |
954 | } | 968 | } |
955 | } | 969 | } |
956 | if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { | 970 | if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { |
957 | pinned[1] = grab_cache_page(inode->i_mapping, last_index); | 971 | pinned[1] = grab_cache_page(inode->i_mapping, last_index); |
958 | if (!PageUptodate(pinned[1])) { | 972 | if (!PageUptodate(pinned[1])) { |
959 | ret = btrfs_readpage(NULL, pinned[1]); | 973 | ret = btrfs_readpage(NULL, pinned[1]); |
@@ -964,10 +978,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
964 | } | 978 | } |
965 | } | 979 | } |
966 | 980 | ||
967 | while (count > 0) { | 981 | while (iov_iter_count(&i) > 0) { |
968 | size_t offset = pos & (PAGE_CACHE_SIZE - 1); | 982 | size_t offset = pos & (PAGE_CACHE_SIZE - 1); |
969 | size_t write_bytes = min(count, nrptrs * | 983 | size_t write_bytes = min(iov_iter_count(&i), |
970 | (size_t)PAGE_CACHE_SIZE - | 984 | nrptrs * (size_t)PAGE_CACHE_SIZE - |
971 | offset); | 985 | offset); |
972 | size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> | 986 | size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> |
973 | PAGE_CACHE_SHIFT; | 987 | PAGE_CACHE_SHIFT; |
@@ -988,7 +1002,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
988 | } | 1002 | } |
989 | 1003 | ||
990 | ret = btrfs_copy_from_user(pos, num_pages, | 1004 | ret = btrfs_copy_from_user(pos, num_pages, |
991 | write_bytes, pages, buf); | 1005 | write_bytes, pages, &i); |
992 | if (ret == 0) { | 1006 | if (ret == 0) { |
993 | dirty_and_release_pages(NULL, root, file, pages, | 1007 | dirty_and_release_pages(NULL, root, file, pages, |
994 | num_pages, pos, write_bytes); | 1008 | num_pages, pos, write_bytes); |
@@ -1012,8 +1026,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
1012 | btrfs_throttle(root); | 1026 | btrfs_throttle(root); |
1013 | } | 1027 | } |
1014 | 1028 | ||
1015 | buf += write_bytes; | ||
1016 | count -= write_bytes; | ||
1017 | pos += write_bytes; | 1029 | pos += write_bytes; |
1018 | num_written += write_bytes; | 1030 | num_written += write_bytes; |
1019 | 1031 | ||
@@ -1206,7 +1218,7 @@ const struct file_operations btrfs_file_operations = { | |||
1206 | .read = do_sync_read, | 1218 | .read = do_sync_read, |
1207 | .aio_read = generic_file_aio_read, | 1219 | .aio_read = generic_file_aio_read, |
1208 | .splice_read = generic_file_splice_read, | 1220 | .splice_read = generic_file_splice_read, |
1209 | .write = btrfs_file_write, | 1221 | .aio_write = btrfs_file_aio_write, |
1210 | .mmap = btrfs_file_mmap, | 1222 | .mmap = btrfs_file_mmap, |
1211 | .open = generic_file_open, | 1223 | .open = generic_file_open, |
1212 | .release = btrfs_release_file, | 1224 | .release = btrfs_release_file, |