path: root/fs/btrfs/inode.c
author    Chris Mason <chris.mason@oracle.com>    2008-07-17 12:53:50 -0400
committer Chris Mason <chris.mason@oracle.com>    2008-09-25 11:04:04 -0400
commit    e6dcd2dc9c489108648e2ed543315dd134d50a9a (patch)
tree      cddf6f588b65e28c5feb8bff89b22d8ff70f8a50 /fs/btrfs/inode.c
parent    77a41afb7d0dd0f27b6f2f1a5bc701929c7034de (diff)
Btrfs: New data=ordered implementation
The old data=ordered code would force commit to wait until all the data extents from the transaction were fully on disk. This introduced large latencies into the commit and stalled new writers in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved and the extent bit EXTENT_ORDERED is set on the entire range of the extent. A struct btrfs_ordered_extent is allocated and inserted into a per-inode rbtree to track the pending extents.

* As each page is written, EXTENT_ORDERED is cleared on the bytes corresponding to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent are written, the previously reserved extent is inserted into the FS btree and into the extent allocation trees. The checksums for the file data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
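The flow in the bullets above can be modelled in a few lines of ordinary userspace C. The sketch below is illustrative only and is not the kernel code: the names (ordered_extent, add_ordered_extent, page_written) are simplified stand-ins for btrfs_ordered_extent, btrfs_add_ordered_extent() and btrfs_dec_test_ordered_pending(), and a singly linked list stands in for the per-inode rbtree.

/*
 * Illustrative model of the scheme described above -- NOT btrfs code.
 * One "ordered extent" is reserved when delalloc is filled; as pages of
 * that extent finish writeback its remaining byte count drops, and only
 * when it reaches zero does the (simulated) metadata insertion happen.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096ULL

struct ordered_extent {
	unsigned long long file_offset;	/* logical start of the reserved extent */
	unsigned long long len;		/* reserved length, in bytes */
	unsigned long long bytes_left;	/* bytes still marked "ordered" */
	struct ordered_extent *next;	/* per-inode list, sorted in the real code */
};

/* Reserve an extent at delalloc-fill time and start tracking it. */
static struct ordered_extent *add_ordered_extent(struct ordered_extent **tree,
						 unsigned long long off,
						 unsigned long long len)
{
	struct ordered_extent *oe = calloc(1, sizeof(*oe));

	oe->file_offset = off;
	oe->len = len;
	oe->bytes_left = len;
	oe->next = *tree;
	*tree = oe;
	return oe;
}

/*
 * Called as each page of the extent finishes writeback: clear that page's
 * share of the "ordered" bytes and, once the whole extent is on disk,
 * do the metadata insertion (here just a printf).
 */
static void page_written(struct ordered_extent *oe, unsigned long long bytes)
{
	oe->bytes_left -= bytes < oe->bytes_left ? bytes : oe->bytes_left;
	if (oe->bytes_left == 0)
		printf("extent [%llu, %llu) fully written: insert file extent + csums\n",
		       oe->file_offset, oe->file_offset + oe->len);
}

int main(void)
{
	struct ordered_extent *tree = NULL;
	unsigned long long i;

	/* delalloc fill: reserve one 3-page extent and start tracking it */
	add_ordered_extent(&tree, 0, 3 * PAGE_SIZE);

	/* writeback completes page by page; metadata goes in after the last one */
	for (i = 0; i < 3; i++)
		page_written(tree, PAGE_SIZE);

	free(tree);
	return 0;
}

The point of this structure is that the transaction commit no longer has to wait for data writeback; only the extent whose last ordered byte just completed gets its file extent and checksums inserted.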
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--	fs/btrfs/inode.c	| 447
1 file changed, 289 insertions(+), 158 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d39433dfb2c7..c5a62f0b9595 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "ordered-data.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -109,10 +110,11 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	u64 num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_start = start;
 	u64 orig_num_bytes;
 	struct btrfs_key ins;
-	int ret;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -120,33 +122,44 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize, num_bytes);
-	ret = btrfs_drop_extents(trans, root, inode,
-				 start, start + num_bytes, start, &alloc_hint);
 	orig_num_bytes = num_bytes;
 
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
 
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
-		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
-					 root->sectorsize,
-					 root->root_key.objectid,
-					 trans->transid,
-					 inode->i_ino, start, 0,
-					 alloc_hint, (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+					   root->sectorsize, 0, 0,
+					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
 			goto out;
 		}
+		em = alloc_extent_map(GFP_NOFS);
+		em->start = start;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		while(1) {
+			spin_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			spin_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, start,
+						start + ins.offset - 1);
+		}
+
 		cur_alloc_size = ins.offset;
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       start, ins.objectid, ins.offset,
-					       ins.offset, 0);
-		inode->i_blocks += ins.offset >> 9;
-		btrfs_check_file(root, inode);
+		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+					       ins.offset);
+		BUG_ON(ret);
 		if (num_bytes < cur_alloc_size) {
 			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
 			       cur_alloc_size);
@@ -156,10 +169,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	btrfs_drop_extent_cache(inode, orig_start,
-				orig_start + orig_num_bytes - 1);
-	btrfs_add_ordered_inode(inode);
-	btrfs_update_inode(trans, root, inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -341,25 +350,15 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	int ret = 0;
-	char *sums = NULL;
+	struct btrfs_ordered_sum *sums;
 
 	ret = btrfs_csum_one_bio(root, bio, &sums);
 	BUG_ON(ret);
 
-	trans = btrfs_start_transaction(root, 1);
-
-	btrfs_set_trans_block_group(trans, inode);
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
-
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_add_ordered_sum(inode, sums);
 	BUG_ON(ret);
 
-	kfree(sums);
-
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
@@ -369,14 +368,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	if (!(rw & (1 << BIO_RW))) {
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-		BUG_ON(ret);
-		goto mapit;
-	}
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
 
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM)) {
+	if (!(rw & (1 << BIO_RW))) {
 		goto mapit;
 	}
 
@@ -387,6 +382,96 @@ mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+			     struct inode *inode, u64 file_offset,
+			     struct list_head *list)
+{
+	struct list_head *cur;
+	struct btrfs_ordered_sum *sum;
+
+	btrfs_set_trans_block_group(trans, inode);
+	while(!list_empty(list)) {
+		cur = list->next;
+		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+		mutex_lock(&BTRFS_I(inode)->csum_mutex);
+		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
+				       inode, sum);
+		mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+		list_del(&sum->list);
+		kfree(sum);
+	}
+	return 0;
+}
+
+int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+				struct extent_state *state, int uptodate)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered_extent;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 alloc_hint = 0;
+	struct list_head list;
+	struct btrfs_key ins;
+	int ret;
+
+	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	if (!ret) {
+		return 0;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+
+	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	BUG_ON(!ordered_extent);
+
+	lock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+
+	INIT_LIST_HEAD(&list);
+
+	ins.objectid = ordered_extent->start;
+	ins.offset = ordered_extent->len;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
+					  trans->transid, inode->i_ino,
+					  ordered_extent->file_offset, &ins);
+	BUG_ON(ret);
+	ret = btrfs_drop_extents(trans, root, inode,
+				 ordered_extent->file_offset,
+				 ordered_extent->file_offset +
+				 ordered_extent->len,
+				 ordered_extent->file_offset, &alloc_hint);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       ordered_extent->file_offset,
+				       ordered_extent->start,
+				       ordered_extent->len,
+				       ordered_extent->len, 0);
+	BUG_ON(ret);
+	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
+				ordered_extent->file_offset +
+				ordered_extent->len - 1);
+	inode->i_blocks += ordered_extent->len >> 9;
+	unlock_extent(io_tree, ordered_extent->file_offset,
+		      ordered_extent->file_offset + ordered_extent->len - 1,
+		      GFP_NOFS);
+	add_pending_csums(trans, inode, ordered_extent->file_offset,
+			  &ordered_extent->list);
+
+	btrfs_remove_ordered_extent(inode, ordered_extent);
+	/* once for us */
+	btrfs_put_ordered_extent(ordered_extent);
+	/* once for the tree */
+	btrfs_put_ordered_extent(ordered_extent);
+
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -409,7 +494,8 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	if (ret == -ENOENT || ret == -EFBIG)
 		ret = 0;
 	csum = 0;
-	printk("no csum found for inode %lu start %Lu\n", inode->i_ino, start);
+	printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+	       start);
 	goto out;
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
@@ -833,7 +919,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
-	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -849,14 +934,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	nr = trans->blocks_used;
 
-	if (inode->i_nlink == 0) {
-		/* if the inode isn't linked anywhere,
-		 * we don't need to worry about
-		 * data=ordered
-		 */
-		btrfs_del_ordered_inode(inode, 1);
-	}
-
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	btrfs_btree_balance_dirty(root, nr);
@@ -931,6 +1008,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
@@ -1117,34 +1195,6 @@ error:
 	return ret;
 }
 
-static int btrfs_cow_one_page(struct inode *inode, struct page *page,
-			      size_t zero_start)
-{
-	char *kaddr;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	int ret = 0;
-
-	WARN_ON(!PageLocked(page));
-	set_page_extent_mapped(page);
-
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
-
-	if (zero_start != PAGE_CACHE_SIZE) {
-		kaddr = kmap(page);
-		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
-		flush_dcache_page(page);
-		kunmap(page);
-	}
-	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-	return ret;
-}
-
 /*
  * taken from block_truncate_page, but does cow as it zeros out
  * any bytes left in the last page in the file.
@@ -1153,12 +1203,16 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
 	int ret = 0;
 	u64 page_start;
+	u64 page_end;
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
@@ -1168,6 +1222,10 @@ again:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
+
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
@@ -1181,10 +1239,32 @@ again:
 			goto out;
 		}
 	}
-
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	wait_on_page_writeback(page);
-	ret = btrfs_cow_one_page(inode, page, offset);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
+	if (offset != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 	unlock_page(page);
 	page_cache_release(page);
@@ -1222,8 +1302,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
+		btrfs_wait_ordered_range(inode, hole_start, hole_size);
+		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
@@ -1258,6 +1339,7 @@ void btrfs_delete_inode(struct inode *inode)
 	unsigned long nr;
 	int ret;
 
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
 		goto no_delete;
@@ -1403,7 +1485,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
 
@@ -1705,7 +1786,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1930,7 +2010,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2066,64 +2145,18 @@ out_unlock:
 
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
-				struct extent_map *em)
+				struct extent_map *em,
+				u64 map_start, u64 map_len)
 {
 	u64 start_diff;
-	u64 new_end;
-	int ret = 0;
-	int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
-
-	if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
-		goto invalid;
-
-	if (!real_blocks && em->block_start != existing->block_start)
-		goto invalid;
-
-	new_end = max(existing->start + existing->len, em->start + em->len);
-
-	if (existing->start >= em->start) {
-		if (em->start + em->len < existing->start)
-			goto invalid;
 
-		start_diff = existing->start - em->start;
-		if (real_blocks && em->block_start + start_diff !=
-		    existing->block_start)
-			goto invalid;
-
-		em->len = new_end - em->start;
-
-		remove_extent_mapping(em_tree, existing);
-		/* free for the tree */
-		free_extent_map(existing);
-		ret = add_extent_mapping(em_tree, em);
-
-	} else if (em->start > existing->start) {
-
-		if (existing->start + existing->len < em->start)
-			goto invalid;
-
-		start_diff = em->start - existing->start;
-		if (real_blocks && existing->block_start + start_diff !=
-		    em->block_start)
-			goto invalid;
-
-		remove_extent_mapping(em_tree, existing);
-		em->block_start = existing->block_start;
-		em->start = existing->start;
-		em->len = new_end - existing->start;
-		free_extent_map(existing);
-
-		ret = add_extent_mapping(em_tree, em);
-	} else {
-		goto invalid;
-	}
-	return ret;
-
-invalid:
-	printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
-	       existing->start, existing->len, existing->block_start,
-	       em->start, em->len, em->block_start);
-	return -EIO;
+	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+	start_diff = map_start - em->start;
+	em->start = map_start;
+	em->len = map_len;
+	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+		em->block_start += start_diff;
+	return add_extent_mapping(em_tree, em);
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2170,10 +2203,9 @@ again:
 		err = -ENOMEM;
 		goto out;
 	}
-
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -2314,6 +2346,9 @@ insert:
 	 */
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
+
+		ret = 0;
+
 		existing = lookup_extent_mapping(em_tree, start, len);
 		if (existing && (existing->start > start ||
 		    existing->start + existing->len <= start)) {
@@ -2325,7 +2360,8 @@ insert:
 							 em->len);
 			if (existing) {
 				err = merge_extent_mapping(em_tree, existing,
-							   em);
+							   em, start,
+							   root->sectorsize);
 				free_extent_map(existing);
 				if (err) {
 					free_extent_map(em);
@@ -2341,6 +2377,7 @@ insert:
 		} else {
 			free_extent_map(em);
 			em = existing;
+			err = 0;
 		}
 	}
 	spin_unlock(&em_tree->lock);
@@ -2348,8 +2385,9 @@ out:
 	btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err)
+		if (!err) {
 			err = ret;
+		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -2474,8 +2512,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
 	return extent_readpages(tree, mapping, pages, nr_pages,
 				btrfs_get_extent);
 }
-
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct extent_io_tree *tree;
 	struct extent_map_tree *map;
@@ -2493,15 +2530,54 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return ret;
 }
 
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					      page_offset(page));
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		return 0;
+	}
+	return __btrfs_releasepage(page, gfp_flags);
+}
+
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+	wait_on_page_writeback(page);
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	extent_invalidatepage(tree, page, offset);
-	btrfs_releasepage(page, GFP_NOFS);
+	if (offset) {
+		btrfs_releasepage(page, GFP_NOFS);
+		return;
+	}
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					      page_offset(page));
+	if (ordered) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
+		btrfs_writepage_end_io_hook(page, page_start,
+					    page_end, NULL, 1);
+		btrfs_put_ordered_extent(ordered);
+		lock_extent(tree, page_start, page_end, GFP_NOFS);
+	}
+	clear_extent_bit(tree, page_start, page_end,
+			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+			 EXTENT_ORDERED,
+			 1, 1, GFP_NOFS);
+	__btrfs_releasepage(page, GFP_NOFS);
+
 	if (PagePrivate(page)) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+		invalidate_extent_lru(tree, page_offset(page),
+				      PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -2527,35 +2603,63 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	unsigned long end;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
+	unsigned long zero_start;
 	loff_t size;
 	int ret;
 	u64 page_start;
+	u64 page_end;
 
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
 	if (ret)
 		goto out;
 
 	ret = -EINVAL;
-
+again:
 	lock_page(page);
-	wait_on_page_writeback(page);
 	size = i_size_read(inode);
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	if ((page->mapping != inode->i_mapping) ||
-	    (page_start > size)) {
+	    (page_start >= size)) {
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
+	wait_on_page_writeback(page);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
 
 	/* page is wholly or partially inside EOF */
 	if (page_start + PAGE_CACHE_SIZE > size)
-		end = size & ~PAGE_CACHE_MASK;
+		zero_start = size & ~PAGE_CACHE_MASK;
 	else
-		end = PAGE_CACHE_SIZE;
+		zero_start = PAGE_CACHE_SIZE;
 
-	ret = btrfs_cow_one_page(inode, page, end);
+	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
 	unlock_page(page);
@@ -2662,15 +2766,28 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
-	ei->ordered_trans = 0;
+	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
+	struct btrfs_ordered_extent *ordered;
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		if (!ordered)
+			break;
+		else {
+			printk("found ordered extent %Lu %Lu\n",
+			       ordered->file_offset, ordered->len);
+			btrfs_remove_ordered_extent(inode, ordered);
+			btrfs_put_ordered_extent(ordered);
+			btrfs_put_ordered_extent(ordered);
+		}
+	}
 	btrfs_drop_extent_cache(inode, 0, (u64)-1);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
@@ -2869,7 +2986,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2921,6 +3037,20 @@ out_fail:
 	return err;
 }
 
+static int btrfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			    EXTENT_DELALLOC, 0)) {
+printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
+WARN_ON(1);
+	}
+	return __set_page_dirty_nobuffers(page);
+}
+
 static int btrfs_permission(struct inode *inode, int mask,
 			    struct nameidata *nd)
 {
@@ -2967,6 +3097,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
@@ -2982,7 +3113,7 @@ static struct address_space_operations btrfs_aops = {
 	.direct_IO = btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage = btrfs_releasepage,
-	.set_page_dirty = __set_page_dirty_nobuffers,
+	.set_page_dirty = btrfs_set_page_dirty,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {