author    Chris Mason <chris.mason@oracle.com>  2008-07-24 11:57:52 -0400
committer Chris Mason <chris.mason@oracle.com>  2008-09-25 11:04:05 -0400
commit    3eaa2885276fd6dac7b076a793932428b7168e74
tree      b06382bec68bf1755597a74ac8225f3bcddda5e5
parent    64f26f745084872b916cd1bef6054e21b15c5784
Btrfs: Fix the defragmentation code and the block relocation code for data=ordered
Before setting an extent to delalloc, the code needs to wait for pending
ordered extents.

Also, the relocation code needs to wait for ordered IO before scanning
the block group again.  This is because the extents are not removed
until the IO for the new extents is finished.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
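Both btrfs_defrag_file() and relocate_inode_pages() get the same shape of fix: after taking the page lock and locking the extent range, look up any ordered extent covering the page; if one is found, drop every lock first, wait for its IO, and retry from the top. The sketch below is a minimal user-space model of that drop-locks-then-wait retry loop using pthreads; struct ordered_extent, ordered_pending() and wait_ordered() are simplified stand-ins invented for illustration, not the btrfs API.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	/* stand-in for struct btrfs_ordered_extent: one in-flight write */
	struct ordered_extent {
		pthread_mutex_t lock;
		pthread_cond_t done;
		bool io_finished;
	};

	/* stand-in for btrfs_lookup_ordered_extent(): is IO still pending? */
	static bool ordered_pending(struct ordered_extent *oe)
	{
		pthread_mutex_lock(&oe->lock);
		bool pending = !oe->io_finished;
		pthread_mutex_unlock(&oe->lock);
		return pending;
	}

	/* stand-in for btrfs_start_ordered_extent(inode, ordered, 1) */
	static void wait_ordered(struct ordered_extent *oe)
	{
		pthread_mutex_lock(&oe->lock);
		while (!oe->io_finished)
			pthread_cond_wait(&oe->done, &oe->lock);
		pthread_mutex_unlock(&oe->lock);
	}

	/* the retry loop this patch adds before set_extent_delalloc() */
	static void mark_delalloc(pthread_mutex_t *page_lock,
				  struct ordered_extent *oe)
	{
	again:
		pthread_mutex_lock(page_lock);	/* grab_cache_page + lock_extent */
		if (ordered_pending(oe)) {
			/* never sleep on IO while holding the page/extent locks */
			pthread_mutex_unlock(page_lock);
			wait_ordered(oe);
			goto again;		/* state may have changed; recheck */
		}
		puts("range is clean, safe to set delalloc");
		pthread_mutex_unlock(page_lock);
	}

	static void *finish_io(void *arg)	/* models the ordered IO completing */
	{
		struct ordered_extent *oe = arg;
		sleep(1);
		pthread_mutex_lock(&oe->lock);
		oe->io_finished = true;
		pthread_cond_broadcast(&oe->done);
		pthread_mutex_unlock(&oe->lock);
		return NULL;
	}

	int main(void)
	{
		struct ordered_extent oe = {
			PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
		};
		pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
		pthread_t io;

		pthread_create(&io, NULL, finish_io, &oe);
		mark_delalloc(&page_lock, &oe);
		pthread_join(io, NULL);
		return 0;
	}

The key design point, visible in both real call sites below, is that the lookup happens only after lock_extent(), so a new ordered extent cannot sneak in between the check and the delalloc marking.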
 fs/btrfs/ctree.h        |  7 +++++++
 fs/btrfs/disk-io.c      |  3 +++
 fs/btrfs/extent-tree.c  | 39 ++++++++++++++++++++++-----------
 fs/btrfs/ioctl.c        | 21 ++++++++++-------
 fs/btrfs/ordered-data.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ordered-data.h |  7 +++++++
 6 files changed, 113 insertions(+), 20 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8ecac2e77a43..6675e916ebcd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -539,6 +539,13 @@ struct btrfs_fs_info {
 	atomic_t nr_async_submits;
 
 	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+	struct list_head ordered_extents;
+
+	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads.  This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7ce3f83c5dd6..ec01062eb41d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1252,6 +1252,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
+	INIT_LIST_HEAD(&fs_info->ordered_extents);
+	spin_lock_init(&fs_info->ordered_extent_lock);
+
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index febc6295c7a9..f92b297e7da5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2640,6 +2640,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	struct file_ra_state *ra;
 	unsigned long total_read = 0;
 	unsigned long ra_pages;
+	struct btrfs_ordered_extent *ordered;
 	struct btrfs_trans_handle *trans;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2658,9 +2659,9 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				       calc_ra(i, last_index, ra_pages));
 		}
 		total_read++;
-		if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+again:
+		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
 			goto truncate_racing;
-
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page) {
 			goto out_unlock;
@@ -2674,18 +2675,24 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				goto out_unlock;
 			}
 		}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(page);
-#else
-		cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(page);
-		set_page_extent_mapped(page);
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
-
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
+		ordered = btrfs_lookup_ordered_extent(inode, page_start);
+		if (ordered) {
+			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			goto again;
+		}
+		set_page_extent_mapped(page);
+
+
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 		set_page_dirty(page);
@@ -2694,10 +2701,18 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-					   total_read);
 
out_unlock:
+	/* we have to start the IO in order to get the ordered extents
+	 * instantiated.  This allows the relocation code to wait
+	 * for all the ordered extents to hit the disk.
+	 *
+	 * Otherwise, it would constantly loop over the same extents
+	 * because the old ones don't get deleted until the IO is
+	 * started.
+	 */
+	btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
+			       WB_SYNC_NONE);
 	kfree(ra);
 	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
 	if (trans) {
@@ -3238,6 +3253,8 @@ next:
 
 	btrfs_clean_old_snapshots(tree_root);
 
+	btrfs_wait_ordered_extents(tree_root);
+
 	trans = btrfs_start_transaction(tree_root, 1);
 	btrfs_commit_transaction(trans, tree_root);
 	mutex_lock(&root->fs_info->alloc_mutex);
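The out_unlock path above starts write-back without waiting (WB_SYNC_NONE) so the ordered extents get instantiated, and the balancing loop then blocks in btrfs_wait_ordered_extents() until they reach disk. A rough user-space analog of that start-now, wait-later split is Linux's sync_file_range(2): one non-blocking call kicks off write-out, a later call with the WAIT flags blocks on it. The snippet below only illustrates the pattern, not the kernel path; the file name scratch.dat is arbitrary.

	#define _GNU_SOURCE			/* sync_file_range() is Linux-only */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("scratch.dat", O_CREAT | O_WRONLY | O_TRUNC, 0644);
		if (fd < 0) {
			perror("open");
			return EXIT_FAILURE;
		}

		char buf[4096] = { 0 };
		for (int i = 0; i < 256; i++)	/* dirty 1MB of page cache */
			if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
				perror("write");
				return EXIT_FAILURE;
			}

		/* phase 1 -- like btrfs_fdatawrite_range(..., WB_SYNC_NONE):
		 * kick off write-out of the whole file, but do not wait */
		if (sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE) < 0)
			perror("sync_file_range (start)");

		/* ... other work can overlap with the in-flight IO here ... */

		/* phase 2 -- like btrfs_wait_ordered_extents(): block until
		 * the previously started write-out has completed */
		if (sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE |
					      SYNC_FILE_RANGE_WRITE |
					      SYNC_FILE_RANGE_WAIT_AFTER) < 0)
			perror("sync_file_range (wait)");

		close(fd);
		return 0;
	}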
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 83f17a5cbd6a..a61f2e7e2db5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -213,6 +213,7 @@ int btrfs_defrag_file(struct file *file)
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
 	struct page *page;
 	unsigned long last_index;
 	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
@@ -234,6 +235,7 @@ int btrfs_defrag_file(struct file *file)
 			   min(last_index, i + ra_pages - 1));
 		}
 		total_read++;
+again:
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
 			goto out_unlock;
@@ -247,18 +249,23 @@ int btrfs_defrag_file(struct file *file)
 			}
 		}
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(page);
-#else
-		cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(page);
-		set_page_extent_mapped(page);
 
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
-
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+		ordered = btrfs_lookup_ordered_extent(inode, page_start);
+		if (ordered) {
+			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			goto again;
+		}
+		set_page_extent_mapped(page);
+
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e42fd233e04c..676e4bd65c52 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -167,20 +167,28 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	entry->inode = inode;
+
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
 	init_waitqueue_head(&entry->wait);
 	INIT_LIST_HEAD(&entry->list);
+	INIT_LIST_HEAD(&entry->root_extent_list);
 
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
 	if (node) {
-		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		atomic_inc(&entry->refs);
+		printk("warning dup entry from add_ordered_extent\n");
+		BUG();
 	}
 	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
 			   entry_end(entry) - 1, GFP_NOFS);
 
+	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	list_add_tail(&entry->root_extent_list,
+		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
+	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
 	mutex_unlock(&tree->mutex);
 	BUG_ON(node);
 	return 0;
@@ -285,11 +293,55 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	rb_erase(node, &tree->tree);
 	tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+
+	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	list_del_init(&entry->root_extent_list);
+	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
 	mutex_unlock(&tree->mutex);
 	wake_up(&entry->wait);
 	return 0;
 }
 
+int btrfs_wait_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct list_head *cur;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while(!list_empty(&splice)) {
+		cur = splice.next;
+		ordered = list_entry(cur, struct btrfs_ordered_extent,
+				     root_extent_list);
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+		inode = ordered->inode;
+
+		/*
+		 * the inode can't go away until all the pages are gone
+		 * and the pages won't go away while there is still
+		 * an ordered extent and the ordered extent won't go
+		 * away until it is off this list.  So, we can safely
+		 * increment i_count here and call iput later
+		 */
+		atomic_inc(&inode->i_count);
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		iput(inode);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	return 0;
+}
+
 /*
  * Used to start IO or wait for a given ordered extent to finish.
 *
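btrfs_wait_ordered_extents() uses a classic drain idiom: splice the shared list to a private head under the spinlock, then for each entry take a reference (and pin the inode via i_count) so nothing is freed once the lock is dropped, release the lock across the blocking wait, and retake it before touching the list again. Below is a compact single-threaded user-space model of that reference-counted drain; struct entry, wait_for_io() and put_entry() are invented stand-ins, and a plain int stands in for the kernel's atomic_t since the model has only one thread.

	#include <stdio.h>
	#include <stdlib.h>
	#include <pthread.h>

	struct entry {				/* models btrfs_ordered_extent +  */
		struct entry *next;		/* its root_extent_list linkage   */
		int refs;			/* the kernel uses atomic_t refs  */
	};

	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct entry *pending;		/* models fs_info->ordered_extents */

	static void put_entry(struct entry *e)	/* btrfs_put_ordered_extent() */
	{
		if (--e->refs == 0)
			free(e);
	}

	static void wait_for_io(struct entry *e)
	{
		/* blocking wait; must never run under list_lock.  On IO
		 * completion the list's reference is dropped (in btrfs,
		 * btrfs_remove_ordered_extent does this) */
		printf("io done for %p\n", (void *)e);
		put_entry(e);
	}

	static void wait_all_pending(void)	/* models btrfs_wait_ordered_extents */
	{
		pthread_mutex_lock(&list_lock);
		struct entry *splice = pending;	/* list_splice_init() */
		pending = NULL;
		while (splice) {
			struct entry *e = splice;
			splice = e->next;	/* list_del_init() */
			e->refs++;		/* pin e across the unlock */
			pthread_mutex_unlock(&list_lock);

			wait_for_io(e);		/* btrfs_start_ordered_extent(.., 1) */
			put_entry(e);		/* drop our pin */

			pthread_mutex_lock(&list_lock);
		}
		pthread_mutex_unlock(&list_lock);
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++) {	/* queue a few fake ordered extents */
			struct entry *e = calloc(1, sizeof(*e));
			e->refs = 1;		/* the list's reference */
			e->next = pending;
			pending = e;
		}
		wait_all_pending();
		return 0;
	}

The inode pinning in the real function follows the same logic as the refcount: the chain of guarantees in the comment (pages hold the inode, the ordered extent holds the pages, the list holds the ordered extent) is what makes the bare atomic_inc(&inode->i_count) safe before the lock is dropped.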
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 199cb0b4f1d9..5efe6b63c74c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -80,6 +80,9 @@ struct btrfs_ordered_extent {
 	/* reference count */
 	atomic_t refs;
 
+	/* the inode we belong to */
+	struct inode *inode;
+
 	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
 
@@ -88,6 +91,9 @@ struct btrfs_ordered_extent {
 
 	/* our friendly rbtree entry */
 	struct rb_node rb_node;
+
+	/* a per root list of all the pending ordered extents */
+	struct list_head root_extent_list;
 };
 
 
@@ -137,4 +143,5 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root);
 #endif