aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/file-item.c25
-rw-r--r--fs/btrfs/file.c69
-rw-r--r--fs/btrfs/inode.c487
-rw-r--r--fs/btrfs/ordered-data.c75
-rw-r--r--fs/btrfs/ordered-data.h9
6 files changed, 631 insertions, 36 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5ed0223d1cbe..e9bf86415e86 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2317,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2317 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2319 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2320int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2321 struct btrfs_root *root, 2323 struct btrfs_root *root,
2322 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 21aead39a76c..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
149} 149}
150 150
151 151
152int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
153 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
154{ 155{
155 u32 sum; 156 u32 sum;
156 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
157 int bio_index = 0; 158 int bio_index = 0;
158 u64 offset; 159 u64 offset = 0;
159 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
160 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
161 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
174 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
175 176
176 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
177 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
178 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
179 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
180 if (ret == 0) 184 if (ret == 0)
181 goto found; 185 goto found;
@@ -238,6 +242,7 @@ found:
238 else 242 else
239 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
240 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
241 bio_index++; 246 bio_index++;
242 bvec++; 247 bvec++;
243 } 248 }
@@ -245,6 +250,18 @@ found:
245 return 0; 250 return 0;
246} 251}
247 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
248int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
249 struct list_head *list) 266 struct list_head *list)
250{ 267{
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6d8f817eadb5..a28810abfb98 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -822,6 +822,47 @@ again:
822 return 0; 822 return 0;
823} 823}
824 824
825/* Copied from read-write.c */
826static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
827{
828 set_current_state(TASK_UNINTERRUPTIBLE);
829 if (!kiocbIsKicked(iocb))
830 schedule();
831 else
832 kiocbClearKicked(iocb);
833 __set_current_state(TASK_RUNNING);
834}
835
836/*
837 * Just a copy of what do_sync_write does.
838 */
839static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf,
840 size_t count, loff_t pos, loff_t *ppos)
841{
842 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
843 unsigned long nr_segs = 1;
844 struct kiocb kiocb;
845 ssize_t ret;
846
847 init_sync_kiocb(&kiocb, file);
848 kiocb.ki_pos = pos;
849 kiocb.ki_left = count;
850 kiocb.ki_nbytes = count;
851
852 while (1) {
853 ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos,
854 ppos, count, count);
855 if (ret != -EIOCBRETRY)
856 break;
857 wait_on_retry_sync_kiocb(&kiocb);
858 }
859
860 if (ret == -EIOCBQUEUED)
861 ret = wait_on_sync_kiocb(&kiocb);
862 *ppos = kiocb.ki_pos;
863 return ret;
864}
865
825static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 866static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
826 size_t count, loff_t *ppos) 867 size_t count, loff_t *ppos)
827{ 868{
@@ -838,12 +879,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
838 unsigned long first_index; 879 unsigned long first_index;
839 unsigned long last_index; 880 unsigned long last_index;
840 int will_write; 881 int will_write;
882 int buffered = 0;
841 883
842 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 884 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
843 (file->f_flags & O_DIRECT)); 885 (file->f_flags & O_DIRECT));
844 886
845 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
846 PAGE_CACHE_SIZE / (sizeof(struct page *)));
847 pinned[0] = NULL; 887 pinned[0] = NULL;
848 pinned[1] = NULL; 888 pinned[1] = NULL;
849 889
@@ -867,13 +907,34 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
867 goto out; 907 goto out;
868 908
869 file_update_time(file); 909 file_update_time(file);
910 BTRFS_I(inode)->sequence++;
911
912 if (unlikely(file->f_flags & O_DIRECT)) {
913 num_written = __btrfs_direct_write(file, buf, count, pos,
914 ppos);
915 pos += num_written;
916 count -= num_written;
917
918 /* We've written everything we wanted to, exit */
919 if (num_written < 0 || !count)
920 goto out;
870 921
922 /*
923 * We are going to do buffered for the rest of the range, so we
924 * need to make sure to invalidate the buffered pages when we're
925 * done.
926 */
927 buffered = 1;
928 buf += num_written;
929 }
930
931 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
932 PAGE_CACHE_SIZE / (sizeof(struct page *)));
871 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 933 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
872 934
873 /* generic_write_checks can change our pos */ 935 /* generic_write_checks can change our pos */
874 start_pos = pos; 936 start_pos = pos;
875 937
876 BTRFS_I(inode)->sequence++;
877 first_index = pos >> PAGE_CACHE_SHIFT; 938 first_index = pos >> PAGE_CACHE_SHIFT;
878 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 939 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
879 940
@@ -1007,7 +1068,7 @@ out:
1007 btrfs_end_transaction(trans, root); 1068 btrfs_end_transaction(trans, root);
1008 } 1069 }
1009 } 1070 }
1010 if (file->f_flags & O_DIRECT) { 1071 if (file->f_flags & O_DIRECT && buffered) {
1011 invalidate_mapping_pages(inode->i_mapping, 1072 invalidate_mapping_pages(inode->i_mapping,
1012 start_pos >> PAGE_CACHE_SHIFT, 1073 start_pos >> PAGE_CACHE_SHIFT,
1013 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1074 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 460dd512eebd..1695440a59a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -698,6 +698,38 @@ retry:
698 return 0; 698 return 0;
699} 699}
700 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
701/* 733/*
702 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
703 * the call backs end up in this code. The basic idea is to 735 * the call backs end up in this code. The basic idea is to
@@ -770,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
770 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
771 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
772 804
773 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
774 read_lock(&BTRFS_I(inode)->extent_tree.lock);
775 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
776 start, num_bytes);
777 if (em) {
778 /*
779 * if block start isn't an actual block number then find the
780 * first block in this inode and use that as a hint. If that
781 * block is also bogus then just don't worry about it.
782 */
783 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
784 free_extent_map(em);
785 em = search_extent_mapping(em_tree, 0, 0);
786 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
787 alloc_hint = em->block_start;
788 if (em)
789 free_extent_map(em);
790 } else {
791 alloc_hint = em->block_start;
792 free_extent_map(em);
793 }
794 }
795 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
796 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
797 807
798 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
@@ -5171,11 +5181,440 @@ out:
5171 return em; 5181 return em;
5172} 5182}
5173 5183
5184static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5185 u64 start, u64 len)
5186{
5187 struct btrfs_root *root = BTRFS_I(inode)->root;
5188 struct btrfs_trans_handle *trans;
5189 struct extent_map *em;
5190 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5191 struct btrfs_key ins;
5192 u64 alloc_hint;
5193 int ret;
5194
5195 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5196
5197 trans = btrfs_join_transaction(root, 0);
5198 if (!trans)
5199 return ERR_PTR(-ENOMEM);
5200
5201 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5202
5203 alloc_hint = get_extent_allocation_hint(inode, start, len);
5204 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5205 alloc_hint, (u64)-1, &ins, 1);
5206 if (ret) {
5207 em = ERR_PTR(ret);
5208 goto out;
5209 }
5210
5211 em = alloc_extent_map(GFP_NOFS);
5212 if (!em) {
5213 em = ERR_PTR(-ENOMEM);
5214 goto out;
5215 }
5216
5217 em->start = start;
5218 em->orig_start = em->start;
5219 em->len = ins.offset;
5220
5221 em->block_start = ins.objectid;
5222 em->block_len = ins.offset;
5223 em->bdev = root->fs_info->fs_devices->latest_bdev;
5224 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5225
5226 while (1) {
5227 write_lock(&em_tree->lock);
5228 ret = add_extent_mapping(em_tree, em);
5229 write_unlock(&em_tree->lock);
5230 if (ret != -EEXIST)
5231 break;
5232 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5233 }
5234
5235 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5236 ins.offset, ins.offset, 0);
5237 if (ret) {
5238 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5239 em = ERR_PTR(ret);
5240 }
5241out:
5242 btrfs_end_transaction(trans, root);
5243 return em;
5244}
5245
5246static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5247 struct buffer_head *bh_result, int create)
5248{
5249 struct extent_map *em;
5250 struct btrfs_root *root = BTRFS_I(inode)->root;
5251 u64 start = iblock << inode->i_blkbits;
5252 u64 len = bh_result->b_size;
5253
5254 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5255 if (IS_ERR(em))
5256 return PTR_ERR(em);
5257
5258 /*
5259 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
5260 * io. INLINE is special, and we could probably kludge it in here, but
5261 * it's still buffered so for safety lets just fall back to the generic
5262 * buffered path.
5263 *
5264 * For COMPRESSED we _have_ to read the entire extent in so we can
5265 * decompress it, so there will be buffering required no matter what we
5266 * do, so go ahead and fallback to buffered.
5267 *
5268 * We return -ENOTBLK because thats what makes DIO go ahead and go back
5269 * to buffered IO. Don't blame me, this is the price we pay for using
5270 * the generic code.
5271 */
5272 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5273 em->block_start == EXTENT_MAP_INLINE) {
5274 free_extent_map(em);
5275 return -ENOTBLK;
5276 }
5277
5278 /* Just a good old fashioned hole, return */
5279 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5280 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5281 free_extent_map(em);
5282 /* DIO will do one hole at a time, so just unlock a sector */
5283 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5284 start + root->sectorsize - 1, GFP_NOFS);
5285 return 0;
5286 }
5287
5288 /*
5289 * We don't allocate a new extent in the following cases
5290 *
5291 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5292 * existing extent.
5293 * 2) The extent is marked as PREALLOC. We're good to go here and can
5294 * just use the extent.
5295 *
5296 */
5297 if (!create)
5298 goto map;
5299
5300 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5301 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5302 em->block_start != EXTENT_MAP_HOLE)) {
5303 u64 block_start;
5304 int type;
5305 int ret;
5306
5307 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5308 type = BTRFS_ORDERED_PREALLOC;
5309 else
5310 type = BTRFS_ORDERED_NOCOW;
5311 len = min(len, em->block_len - (start - em->start));
5312 block_start = em->block_start + (start - em->start);
5313 ret = btrfs_add_ordered_extent_dio(inode, start,
5314 start, len, len, type);
5315 if (ret) {
5316 free_extent_map(em);
5317 return ret;
5318 }
5319 } else {
5320 free_extent_map(em);
5321 em = btrfs_new_extent_direct(inode, start, len);
5322 if (IS_ERR(em))
5323 return PTR_ERR(em);
5324 len = min(len, em->block_len);
5325 }
5326 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5327 GFP_NOFS);
5328map:
5329 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5330 inode->i_blkbits;
5331 bh_result->b_size = em->len - (start - em->start);
5332 bh_result->b_bdev = em->bdev;
5333 set_buffer_mapped(bh_result);
5334 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5335 set_buffer_new(bh_result);
5336
5337 free_extent_map(em);
5338
5339 return 0;
5340}
5341
5342struct btrfs_dio_private {
5343 struct inode *inode;
5344 u64 logical_offset;
5345 u64 disk_bytenr;
5346 u64 bytes;
5347 u32 *csums;
5348 void *private;
5349};
5350
5351static void btrfs_endio_direct_read(struct bio *bio, int err)
5352{
5353 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5354 struct bio_vec *bvec = bio->bi_io_vec;
5355 struct btrfs_dio_private *dip = bio->bi_private;
5356 struct inode *inode = dip->inode;
5357 struct btrfs_root *root = BTRFS_I(inode)->root;
5358 u64 start;
5359 u32 *private = dip->csums;
5360
5361 start = dip->logical_offset;
5362 do {
5363 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5364 struct page *page = bvec->bv_page;
5365 char *kaddr;
5366 u32 csum = ~(u32)0;
5367 unsigned long flags;
5368
5369 local_irq_save(flags);
5370 kaddr = kmap_atomic(page, KM_IRQ0);
5371 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5372 csum, bvec->bv_len);
5373 btrfs_csum_final(csum, (char *)&csum);
5374 kunmap_atomic(kaddr, KM_IRQ0);
5375 local_irq_restore(flags);
5376
5377 flush_dcache_page(bvec->bv_page);
5378 if (csum != *private) {
5379 printk(KERN_ERR "btrfs csum failed ino %lu off"
5380 " %llu csum %u private %u\n",
5381 inode->i_ino, (unsigned long long)start,
5382 csum, *private);
5383 err = -EIO;
5384 }
5385 }
5386
5387 start += bvec->bv_len;
5388 private++;
5389 bvec++;
5390 } while (bvec <= bvec_end);
5391
5392 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5393 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5394 bio->bi_private = dip->private;
5395
5396 kfree(dip->csums);
5397 kfree(dip);
5398 dio_end_io(bio, err);
5399}
5400
5401static void btrfs_endio_direct_write(struct bio *bio, int err)
5402{
5403 struct btrfs_dio_private *dip = bio->bi_private;
5404 struct inode *inode = dip->inode;
5405 struct btrfs_root *root = BTRFS_I(inode)->root;
5406 struct btrfs_trans_handle *trans;
5407 struct btrfs_ordered_extent *ordered = NULL;
5408 struct extent_state *cached_state = NULL;
5409 int ret;
5410
5411 if (err)
5412 goto out_done;
5413
5414 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5415 dip->logical_offset, dip->bytes);
5416 if (!ret)
5417 goto out_done;
5418
5419 BUG_ON(!ordered);
5420
5421 trans = btrfs_join_transaction(root, 1);
5422 if (!trans) {
5423 err = -ENOMEM;
5424 goto out;
5425 }
5426 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5427
5428 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5429 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5430 if (!ret)
5431 ret = btrfs_update_inode(trans, root, inode);
5432 err = ret;
5433 goto out;
5434 }
5435
5436 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5437 ordered->file_offset + ordered->len - 1, 0,
5438 &cached_state, GFP_NOFS);
5439
5440 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5441 ret = btrfs_mark_extent_written(trans, inode,
5442 ordered->file_offset,
5443 ordered->file_offset +
5444 ordered->len);
5445 if (ret) {
5446 err = ret;
5447 goto out_unlock;
5448 }
5449 } else {
5450 ret = insert_reserved_file_extent(trans, inode,
5451 ordered->file_offset,
5452 ordered->start,
5453 ordered->disk_len,
5454 ordered->len,
5455 ordered->len,
5456 0, 0, 0,
5457 BTRFS_FILE_EXTENT_REG);
5458 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5459 ordered->file_offset, ordered->len);
5460 if (ret) {
5461 err = ret;
5462 WARN_ON(1);
5463 goto out_unlock;
5464 }
5465 }
5466
5467 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5468 btrfs_ordered_update_i_size(inode, 0, ordered);
5469 btrfs_update_inode(trans, root, inode);
5470out_unlock:
5471 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5472 ordered->file_offset + ordered->len - 1,
5473 &cached_state, GFP_NOFS);
5474out:
5475 btrfs_delalloc_release_metadata(inode, ordered->len);
5476 btrfs_end_transaction(trans, root);
5477 btrfs_put_ordered_extent(ordered);
5478 btrfs_put_ordered_extent(ordered);
5479out_done:
5480 bio->bi_private = dip->private;
5481
5482 kfree(dip->csums);
5483 kfree(dip);
5484 dio_end_io(bio, err);
5485}
5486
5487static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5488 loff_t file_offset)
5489{
5490 struct btrfs_root *root = BTRFS_I(inode)->root;
5491 struct btrfs_dio_private *dip;
5492 struct bio_vec *bvec = bio->bi_io_vec;
5493 u64 start;
5494 int skip_sum;
5495 int write = rw & (1 << BIO_RW);
5496 int ret = 0;
5497
5498 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5499
5500 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5501 if (!dip) {
5502 ret = -ENOMEM;
5503 goto free_ordered;
5504 }
5505 dip->csums = NULL;
5506
5507 if (!skip_sum) {
5508 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5509 if (!dip->csums) {
5510 ret = -ENOMEM;
5511 goto free_ordered;
5512 }
5513 }
5514
5515 dip->private = bio->bi_private;
5516 dip->inode = inode;
5517 dip->logical_offset = file_offset;
5518
5519 start = dip->logical_offset;
5520 dip->bytes = 0;
5521 do {
5522 dip->bytes += bvec->bv_len;
5523 bvec++;
5524 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5525
5526 dip->disk_bytenr = bio->bi_sector << 9;
5527 bio->bi_private = dip;
5528
5529 if (write)
5530 bio->bi_end_io = btrfs_endio_direct_write;
5531 else
5532 bio->bi_end_io = btrfs_endio_direct_read;
5533
5534 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5535 if (ret)
5536 goto out_err;
5537
5538 if (write && !skip_sum)
5539 btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1);
5540 else if (!skip_sum)
5541 btrfs_lookup_bio_sums_dio(root, inode, bio,
5542 dip->logical_offset, dip->csums);
5543
5544 ret = btrfs_map_bio(root, rw, bio, 0, 0);
5545 if (ret)
5546 goto out_err;
5547 return;
5548out_err:
5549 kfree(dip->csums);
5550 kfree(dip);
5551free_ordered:
5552 /*
5553 * If this is a write, we need to clean up the reserved space and kill
5554 * the ordered extent.
5555 */
5556 if (write) {
5557 struct btrfs_ordered_extent *ordered;
5558 ordered = btrfs_lookup_ordered_extent(inode,
5559 dip->logical_offset);
5560 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5561 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5562 btrfs_free_reserved_extent(root, ordered->start,
5563 ordered->disk_len);
5564 btrfs_put_ordered_extent(ordered);
5565 btrfs_put_ordered_extent(ordered);
5566 }
5567 bio_endio(bio, ret);
5568}
5569
5174static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5570static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
5175 const struct iovec *iov, loff_t offset, 5571 const struct iovec *iov, loff_t offset,
5176 unsigned long nr_segs) 5572 unsigned long nr_segs)
5177{ 5573{
5178 return -EINVAL; 5574 struct file *file = iocb->ki_filp;
5575 struct inode *inode = file->f_mapping->host;
5576 struct btrfs_ordered_extent *ordered;
5577 u64 lockstart, lockend;
5578 ssize_t ret;
5579
5580 lockstart = offset;
5581 lockend = offset + iov_length(iov, nr_segs) - 1;
5582 while (1) {
5583 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5584 GFP_NOFS);
5585 /*
5586 * We're concerned with the entire range that we're going to be
5587 * doing DIO to, so we need to make sure theres no ordered
5588 * extents in this range.
5589 */
5590 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5591 lockend - lockstart + 1);
5592 if (!ordered)
5593 break;
5594 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5595 GFP_NOFS);
5596 btrfs_start_ordered_extent(inode, ordered, 1);
5597 btrfs_put_ordered_extent(ordered);
5598 cond_resched();
5599 }
5600
5601 ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
5602 btrfs_get_blocks_direct, NULL,
5603 btrfs_submit_direct, 0);
5604
5605 if (ret < 0 && ret != -EIOCBQUEUED) {
5606 unlock_extent(&BTRFS_I(inode)->io_tree, offset,
5607 offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
5608 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5609 /*
5610 * We're falling back to buffered, unlock the section we didn't
5611 * do IO on.
5612 */
5613 unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
5614 offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
5615 }
5616
5617 return ret;
5179} 5618}
5180 5619
5181static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5620static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c9f1020572f2..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
127/* 136/*
128 * look find the first ordered struct that has this offset, otherwise 137 * look find the first ordered struct that has this offset, otherwise
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -484,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
484 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
485 * for pdflush to find them 512 * for pdflush to find them
486 */ 513 */
487 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
488 if (wait) { 516 if (wait) {
489 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
490 &entry->flags)); 518 &entry->flags));
@@ -581,6 +609,47 @@ out:
581 return entry; 609 return entry;
582} 610}
583 611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
584/* 653/*
585 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
586 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);