diff options
author | Josef Bacik <josef@redhat.com> | 2010-05-23 11:00:55 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2010-05-25 10:34:57 -0400 |
commit | 4b46fce23349bfca781a32e2707a18328ca5ae22 (patch) | |
tree | 68f1200f2bc82d3f35218aef38e6d5d92bff4aca /fs/btrfs/file.c | |
parent | c2c6ca417e2db7a519e6e92c82f4a933d940d076 (diff) |
Btrfs: add basic DIO read/write support
This provides basic DIO support for reading and writing. It does not do the
work to recover from mismatching checksums; that will come later. A few design
changes have been made from Jim's code (sorry Jim!)
1) Use the generic direct-io code. Jim originally re-wrote all the generic DIO
code in order to account for all of BTRFS's oddities, but thanks to that work it
seems like the best bet is to just ignore compression and such and just opt to
fall back on buffered IO.
2) Fall back on buffered IO for compressed or inline extents. Jim's code did
its own buffering to make dio with compressed extents work. Now we just
fall back onto normal buffered IO.
3) Use ordered extents for the writes so that all of the
lock_extent()
lookup_ordered()
type checks continue to work.
4) Do the lock_extent() lookup_ordered() loop in readpage so we don't race with
DIO writes.
I've tested this with fsx and everything works great. This patch depends on my
dio and filemap.c patches to work. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 69 |
1 files changed, 65 insertions, 4 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6d8f817eadb5..a28810abfb98 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -822,6 +822,47 @@ again: | |||
822 | return 0; | 822 | return 0; |
823 | } | 823 | } |
824 | 824 | ||
825 | /* Copied from read-write.c */ | ||
826 | static void wait_on_retry_sync_kiocb(struct kiocb *iocb) | ||
827 | { | ||
828 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
829 | if (!kiocbIsKicked(iocb)) | ||
830 | schedule(); | ||
831 | else | ||
832 | kiocbClearKicked(iocb); | ||
833 | __set_current_state(TASK_RUNNING); | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Just a copy of what do_sync_write does. | ||
838 | */ | ||
839 | static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, | ||
840 | size_t count, loff_t pos, loff_t *ppos) | ||
841 | { | ||
842 | struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; | ||
843 | unsigned long nr_segs = 1; | ||
844 | struct kiocb kiocb; | ||
845 | ssize_t ret; | ||
846 | |||
847 | init_sync_kiocb(&kiocb, file); | ||
848 | kiocb.ki_pos = pos; | ||
849 | kiocb.ki_left = count; | ||
850 | kiocb.ki_nbytes = count; | ||
851 | |||
852 | while (1) { | ||
853 | ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, | ||
854 | ppos, count, count); | ||
855 | if (ret != -EIOCBRETRY) | ||
856 | break; | ||
857 | wait_on_retry_sync_kiocb(&kiocb); | ||
858 | } | ||
859 | |||
860 | if (ret == -EIOCBQUEUED) | ||
861 | ret = wait_on_sync_kiocb(&kiocb); | ||
862 | *ppos = kiocb.ki_pos; | ||
863 | return ret; | ||
864 | } | ||
865 | |||
825 | static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | 866 | static ssize_t btrfs_file_write(struct file *file, const char __user *buf, |
826 | size_t count, loff_t *ppos) | 867 | size_t count, loff_t *ppos) |
827 | { | 868 | { |
@@ -838,12 +879,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
838 | unsigned long first_index; | 879 | unsigned long first_index; |
839 | unsigned long last_index; | 880 | unsigned long last_index; |
840 | int will_write; | 881 | int will_write; |
882 | int buffered = 0; | ||
841 | 883 | ||
842 | will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || | 884 | will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || |
843 | (file->f_flags & O_DIRECT)); | 885 | (file->f_flags & O_DIRECT)); |
844 | 886 | ||
845 | nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, | ||
846 | PAGE_CACHE_SIZE / (sizeof(struct page *))); | ||
847 | pinned[0] = NULL; | 887 | pinned[0] = NULL; |
848 | pinned[1] = NULL; | 888 | pinned[1] = NULL; |
849 | 889 | ||
@@ -867,13 +907,34 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | |||
867 | goto out; | 907 | goto out; |
868 | 908 | ||
869 | file_update_time(file); | 909 | file_update_time(file); |
910 | BTRFS_I(inode)->sequence++; | ||
911 | |||
912 | if (unlikely(file->f_flags & O_DIRECT)) { | ||
913 | num_written = __btrfs_direct_write(file, buf, count, pos, | ||
914 | ppos); | ||
915 | pos += num_written; | ||
916 | count -= num_written; | ||
917 | |||
918 | /* We've written everything we wanted to, exit */ | ||
919 | if (num_written < 0 || !count) | ||
920 | goto out; | ||
870 | 921 | ||
922 | /* | ||
923 | * We are going to do buffered for the rest of the range, so we | ||
924 | * need to make sure to invalidate the buffered pages when we're | ||
925 | * done. | ||
926 | */ | ||
927 | buffered = 1; | ||
928 | buf += num_written; | ||
929 | } | ||
930 | |||
931 | nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, | ||
932 | PAGE_CACHE_SIZE / (sizeof(struct page *))); | ||
871 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); | 933 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); |
872 | 934 | ||
873 | /* generic_write_checks can change our pos */ | 935 | /* generic_write_checks can change our pos */ |
874 | start_pos = pos; | 936 | start_pos = pos; |
875 | 937 | ||
876 | BTRFS_I(inode)->sequence++; | ||
877 | first_index = pos >> PAGE_CACHE_SHIFT; | 938 | first_index = pos >> PAGE_CACHE_SHIFT; |
878 | last_index = (pos + count) >> PAGE_CACHE_SHIFT; | 939 | last_index = (pos + count) >> PAGE_CACHE_SHIFT; |
879 | 940 | ||
@@ -1007,7 +1068,7 @@ out: | |||
1007 | btrfs_end_transaction(trans, root); | 1068 | btrfs_end_transaction(trans, root); |
1008 | } | 1069 | } |
1009 | } | 1070 | } |
1010 | if (file->f_flags & O_DIRECT) { | 1071 | if (file->f_flags & O_DIRECT && buffered) { |
1011 | invalidate_mapping_pages(inode->i_mapping, | 1072 | invalidate_mapping_pages(inode->i_mapping, |
1012 | start_pos >> PAGE_CACHE_SHIFT, | 1073 | start_pos >> PAGE_CACHE_SHIFT, |
1013 | (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); | 1074 | (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); |