author    Chris Mason <chris.mason@oracle.com>  2011-05-24 15:35:30 -0400
committer Chris Mason <chris.mason@oracle.com>  2011-05-26 17:52:15 -0400
commit    4cb5300bc839b8a943eb19c9f27f25470e22d0ca (patch)
tree      ac0f2fb481c7aa6af08a624d276fa6d580c94c9b
parent    d6c0cb379c5198487e4ac124728cbb2346d63b1f (diff)
Btrfs: add mount -o auto_defrag

This will detect small random writes into files and queue them up for
an auto defrag process.  It isn't well suited to database workloads yet,
but it works for smaller files such as rpm, sqlite or bdb databases.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
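Usage note: although the subject line says auto_defrag, the token added to
the mount option table at the end of this patch is "autodefrag".  A minimal
sketch of enabling it from userspace via mount(2) -- the device and
mountpoint paths here are made up for illustration:

        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                /* same effect as: mount -o autodefrag /dev/sdb /mnt/btrfs */
                if (mount("/dev/sdb", "/mnt/btrfs", "btrfs", 0, "autodefrag")) {
                        perror("mount");
                        return 1;
                }
                return 0;
        }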
-rw-r--r--  fs/btrfs/btrfs_inode.h |   1
-rw-r--r--  fs/btrfs/ctree.h       |  45
-rw-r--r--  fs/btrfs/disk-io.c     |  12
-rw-r--r--  fs/btrfs/file.c        | 257
-rw-r--r--  fs/btrfs/inode.c       |  12
-rw-r--r--  fs/btrfs/ioctl.c       | 448
-rw-r--r--  fs/btrfs/ioctl.h       |  31
-rw-r--r--  fs/btrfs/super.c       |   7
8 files changed, 678 insertions(+), 135 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d0b0e43a6a8b..93b1aa932014 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,7 @@ struct btrfs_inode {
         unsigned ordered_data_close:1;
         unsigned orphan_meta_reserved:1;
         unsigned dummy_inode:1;
+        unsigned in_defrag:1;

         /*
          * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 026fc47b42cf..332323e19dd1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1074,6 +1074,11 @@ struct btrfs_fs_info {
         /* all metadata allocations go through this cluster */
         struct btrfs_free_cluster meta_alloc_cluster;

+        /* auto defrag inodes go here */
+        spinlock_t defrag_inodes_lock;
+        struct rb_root defrag_inodes;
+        atomic_t defrag_running;
+
         spinlock_t ref_cache_lock;
         u64 total_ref_cache_size;

@@ -1205,6 +1210,38 @@ struct btrfs_root {
         struct super_block anon_super;
 };

+struct btrfs_ioctl_defrag_range_args {
+        /* start of the defrag operation */
+        __u64 start;
+
+        /* number of bytes to defrag, use (u64)-1 to say all */
+        __u64 len;
+
+        /*
+         * flags for the operation, which can include turning
+         * on compression for this one defrag
+         */
+        __u64 flags;
+
+        /*
+         * any extent bigger than this will be considered
+         * already defragged.  Use 0 to take the kernel default
+         * Use 1 to say every single extent must be rewritten
+         */
+        __u32 extent_thresh;
+
+        /*
+         * which compression method to use if turning on compression
+         * for this defrag operation.  If unspecified, zlib will
+         * be used
+         */
+        __u32 compress_type;
+
+        /* spare for later */
+        __u32 unused[4];
+};
+
+
 /*
  * inode items have the data typically returned from stat and store other
  * info about object characteristics. There is one for every file and dir in
@@ -1302,6 +1339,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_CLEAR_CACHE                (1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED     (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG               (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG                (1 << 16)

 #define btrfs_clear_opt(o, opt)        ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -2528,8 +2566,13 @@ extern const struct dentry_operations btrfs_dentry_operations;
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
-
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+                      struct btrfs_ioctl_defrag_range_args *range,
+                      u64 newer_than, unsigned long max_pages);
 /* file.c */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+                           struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                             int skip_pinned);
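The struct above is the same btrfs_ioctl_defrag_range_args that used to live
in ioctl.h (it is deleted from there at the end of this patch); moving it into
ctree.h lets the auto-defrag worker in file.c build one internally.  For
reference, a hedged sketch of how a userspace caller might drive the existing
BTRFS_IOC_DEFRAG_RANGE ioctl with it -- assuming the struct and ioctl
definitions are copied from fs/btrfs/ioctl.h the way btrfs-progs does, and
with a caller-supplied path:

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include "ioctl.h"        /* btrfs ioctl defs, copied from the kernel */

        int defrag_whole_file(const char *path)
        {
                struct btrfs_ioctl_defrag_range_args range = {
                        .start = 0,
                        .len = (__u64)-1,       /* the whole file */
                        .extent_thresh = 0,     /* 0 = kernel default */
                };
                int fd = open(path, O_RDWR);

                if (fd < 0)
                        return -1;
                /* this routes into the reworked btrfs_defrag_file() below */
                if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
                        perror("BTRFS_IOC_DEFRAG_RANGE");
                close(fd);
                return 0;
        }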
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 16d335b342a2..b2588a552658 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1475,6 +1475,7 @@ static int cleaner_kthread(void *arg)
                 btrfs_run_delayed_iputs(root);
                 btrfs_clean_old_snapshots(root);
                 mutex_unlock(&root->fs_info->cleaner_mutex);
+                btrfs_run_defrag_inodes(root->fs_info);
         }

         if (freezing(current)) {
@@ -1616,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         spin_lock_init(&fs_info->ref_cache_lock);
         spin_lock_init(&fs_info->fs_roots_radix_lock);
         spin_lock_init(&fs_info->delayed_iput_lock);
+        spin_lock_init(&fs_info->defrag_inodes_lock);

         init_completion(&fs_info->kobj_unregister);
         fs_info->tree_root = tree_root;
@@ -1638,9 +1640,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         atomic_set(&fs_info->async_delalloc_pages, 0);
         atomic_set(&fs_info->async_submit_draining, 0);
         atomic_set(&fs_info->nr_async_bios, 0);
+        atomic_set(&fs_info->defrag_running, 0);
         fs_info->sb = sb;
         fs_info->max_inline = 8192 * 1024;
         fs_info->metadata_ratio = 0;
+        fs_info->defrag_inodes = RB_ROOT;

         fs_info->thread_pool_size = min_t(unsigned long,
                                           num_online_cpus() + 2, 8);
@@ -2501,6 +2505,14 @@ int close_ctree(struct btrfs_root *root)
         smp_mb();

         btrfs_scrub_cancel(root);
+
+        /* wait for any defraggers to finish */
+        wait_event(fs_info->transaction_wait,
+                   (atomic_read(&fs_info->defrag_running) == 0));
+
+        /* clear out the rbtree of defraggable inodes */
+        btrfs_run_defrag_inodes(root->fs_info);
+
         btrfs_put_block_group_cache(fs_info);

         /*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58ddc4442159..c6a22d783c35 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,263 @@
40#include "locking.h" 40#include "locking.h"
41#include "compat.h" 41#include "compat.h"
42 42
43/*
44 * when auto defrag is enabled we
45 * queue up these defrag structs to remember which
46 * inodes need defragging passes
47 */
48struct inode_defrag {
49 struct rb_node rb_node;
50 /* objectid */
51 u64 ino;
52 /*
53 * transid where the defrag was added, we search for
54 * extents newer than this
55 */
56 u64 transid;
57
58 /* root objectid */
59 u64 root;
60
61 /* last offset we were able to defrag */
62 u64 last_offset;
63
64 /* if we've wrapped around back to zero once already */
65 int cycled;
66};
67
68/* pop a record for an inode into the defrag tree. The lock
69 * must be held already
70 *
71 * If you're inserting a record for an older transid than an
72 * existing record, the transid already in the tree is lowered
73 *
74 * If an existing record is found the defrag item you
75 * pass in is freed
76 */
77static int __btrfs_add_inode_defrag(struct inode *inode,
78 struct inode_defrag *defrag)
79{
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct inode_defrag *entry;
82 struct rb_node **p;
83 struct rb_node *parent = NULL;
84
85 p = &root->fs_info->defrag_inodes.rb_node;
86 while (*p) {
87 parent = *p;
88 entry = rb_entry(parent, struct inode_defrag, rb_node);
89
90 if (defrag->ino < entry->ino)
91 p = &parent->rb_left;
92 else if (defrag->ino > entry->ino)
93 p = &parent->rb_right;
94 else {
95 /* if we're reinserting an entry for
96 * an old defrag run, make sure to
97 * lower the transid of our existing record
98 */
99 if (defrag->transid < entry->transid)
100 entry->transid = defrag->transid;
101 if (defrag->last_offset > entry->last_offset)
102 entry->last_offset = defrag->last_offset;
103 goto exists;
104 }
105 }
106 BTRFS_I(inode)->in_defrag = 1;
107 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return 0;
110
111exists:
112 kfree(defrag);
113 return 0;
114
115}
116
117/*
118 * insert a defrag record for this inode if auto defrag is
119 * enabled
120 */
121int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
122 struct inode *inode)
123{
124 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct inode_defrag *defrag;
126 int ret = 0;
127 u64 transid;
128
129 if (!btrfs_test_opt(root, AUTO_DEFRAG))
130 return 0;
131
132 if (root->fs_info->closing)
133 return 0;
134
135 if (BTRFS_I(inode)->in_defrag)
136 return 0;
137
138 if (trans)
139 transid = trans->transid;
140 else
141 transid = BTRFS_I(inode)->root->last_trans;
142
143 defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
144 if (!defrag)
145 return -ENOMEM;
146
147 defrag->ino = inode->i_ino;
148 defrag->transid = transid;
149 defrag->root = root->root_key.objectid;
150
151 spin_lock(&root->fs_info->defrag_inodes_lock);
152 if (!BTRFS_I(inode)->in_defrag)
153 ret = __btrfs_add_inode_defrag(inode, defrag);
154 spin_unlock(&root->fs_info->defrag_inodes_lock);
155 return ret;
156}
157
158/*
159 * must be called with the defrag_inodes lock held
160 */
161struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
162 struct rb_node **next)
163{
164 struct inode_defrag *entry = NULL;
165 struct rb_node *p;
166 struct rb_node *parent = NULL;
167
168 p = info->defrag_inodes.rb_node;
169 while (p) {
170 parent = p;
171 entry = rb_entry(parent, struct inode_defrag, rb_node);
172
173 if (ino < entry->ino)
174 p = parent->rb_left;
175 else if (ino > entry->ino)
176 p = parent->rb_right;
177 else
178 return entry;
179 }
180
181 if (next) {
182 while (parent && ino > entry->ino) {
183 parent = rb_next(parent);
184 entry = rb_entry(parent, struct inode_defrag, rb_node);
185 }
186 *next = parent;
187 }
188 return NULL;
189}
190
191/*
192 * run through the list of inodes in the FS that need
193 * defragging
194 */
195int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
196{
197 struct inode_defrag *defrag;
198 struct btrfs_root *inode_root;
199 struct inode *inode;
200 struct rb_node *n;
201 struct btrfs_key key;
202 struct btrfs_ioctl_defrag_range_args range;
203 u64 first_ino = 0;
204 int num_defrag;
205 int defrag_batch = 1024;
206
207 memset(&range, 0, sizeof(range));
208 range.len = (u64)-1;
209
210 atomic_inc(&fs_info->defrag_running);
211 spin_lock(&fs_info->defrag_inodes_lock);
212 while(1) {
213 n = NULL;
214
215 /* find an inode to defrag */
216 defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
217 if (!defrag) {
218 if (n)
219 defrag = rb_entry(n, struct inode_defrag, rb_node);
220 else if (first_ino) {
221 first_ino = 0;
222 continue;
223 } else {
224 break;
225 }
226 }
227
228 /* remove it from the rbtree */
229 first_ino = defrag->ino + 1;
230 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
231
232 if (fs_info->closing)
233 goto next_free;
234
235 spin_unlock(&fs_info->defrag_inodes_lock);
236
237 /* get the inode */
238 key.objectid = defrag->root;
239 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
240 key.offset = (u64)-1;
241 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
242 if (IS_ERR(inode_root))
243 goto next;
244
245 key.objectid = defrag->ino;
246 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
247 key.offset = 0;
248
249 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
250 if (IS_ERR(inode))
251 goto next;
252
253 /* do a chunk of defrag */
254 BTRFS_I(inode)->in_defrag = 0;
255 range.start = defrag->last_offset;
256 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
257 defrag_batch);
258 /*
259 * if we filled the whole defrag batch, there
260 * must be more work to do. Queue this defrag
261 * again
262 */
263 if (num_defrag == defrag_batch) {
264 defrag->last_offset = range.start;
265 __btrfs_add_inode_defrag(inode, defrag);
266 /*
267 * we don't want to kfree defrag, we added it back to
268 * the rbtree
269 */
270 defrag = NULL;
271 } else if (defrag->last_offset && !defrag->cycled) {
272 /*
273 * we didn't fill our defrag batch, but
274 * we didn't start at zero. Make sure we loop
275 * around to the start of the file.
276 */
277 defrag->last_offset = 0;
278 defrag->cycled = 1;
279 __btrfs_add_inode_defrag(inode, defrag);
280 defrag = NULL;
281 }
282
283 iput(inode);
284next:
285 spin_lock(&fs_info->defrag_inodes_lock);
286next_free:
287 kfree(defrag);
288 }
289 spin_unlock(&fs_info->defrag_inodes_lock);
290
291 atomic_dec(&fs_info->defrag_running);
292
293 /*
294 * during unmount, we use the transaction_wait queue to
295 * wait for the defragger to stop
296 */
297 wake_up(&fs_info->transaction_wait);
298 return 0;
299}
43 300
44/* simple helper to fault in pages and copy. This should go away 301/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code. 302 * and be replaced with calls into generic code.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d378f8b70ef7..bb51bb1fa44f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -342,6 +342,10 @@ static noinline int compress_file_range(struct inode *inode,
         int will_compress;
         int compress_type = root->fs_info->compress_type;

+        /* if this is a small write inside eof, kick off a defragbot */
+        if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+                btrfs_add_inode_defrag(NULL, inode);
+
         actual_end = min_t(u64, isize, end + 1);
 again:
         will_compress = 0;
@@ -799,6 +803,10 @@ static noinline int cow_file_range(struct inode *inode,
         disk_num_bytes = num_bytes;
         ret = 0;

+        /* if this is a small write inside eof, kick off defrag */
+        if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+                btrfs_add_inode_defrag(trans, inode);
+
         if (start == 0) {
                 /* lets try to make an inline extent */
                 ret = cow_file_range_inline(trans, root, inode,
@@ -5371,6 +5379,9 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
         if (IS_ERR(trans))
                 return ERR_CAST(trans);

+        if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
+                btrfs_add_inode_defrag(trans, inode);
+
         trans->block_rsv = &root->fs_info->delalloc_block_rsv;

         alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -6682,6 +6693,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         ei->ordered_data_close = 0;
         ei->orphan_meta_reserved = 0;
         ei->dummy_inode = 0;
+        ei->in_defrag = 0;
         ei->force_compress = BTRFS_COMPRESS_NONE;

         ei->delayed_node = NULL;
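Taken together, these three hooks queue an inode for auto defrag only when a
write lands entirely below the on-disk i_size and is small: under 16k on the
compression path, under 64k in cow_file_range and the direct-IO allocation
path.  So, for example, a 4k record update in the middle of an existing
sqlite file trips the cow_file_range check and calls btrfs_add_inode_defrag(),
while an append that extends the file does not, since its end is past
disk_i_size.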
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c4f17e4e2c9c..85e818ce00c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -656,6 +656,106 @@ out_unlock:
         return error;
 }

+/*
+ * When we're defragging a range, we don't want to kick it off again
+ * if it is really just waiting for delalloc to send it down.
+ * If we find a nice big extent or delalloc range for the bytes in the
+ * file you want to defrag, we return 0 to let you know to skip this
+ * part of the file
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+{
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct extent_map *em = NULL;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        u64 end;
+
+        read_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+        read_unlock(&em_tree->lock);
+
+        if (em) {
+                end = extent_map_end(em);
+                free_extent_map(em);
+                if (end - offset > thresh)
+                        return 0;
+        }
+        /* if we already have a nice delalloc here, just stop */
+        thresh /= 2;
+        end = count_range_bits(io_tree, &offset, offset + thresh,
+                               thresh, EXTENT_DELALLOC, 1);
+        if (end >= thresh)
+                return 0;
+        return 1;
+}
+
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ */
+static int find_new_extents(struct btrfs_root *root,
+                            struct inode *inode, u64 newer_than,
+                            u64 *off, int thresh)
+{
+        struct btrfs_path *path;
+        struct btrfs_key min_key;
+        struct btrfs_key max_key;
+        struct extent_buffer *leaf;
+        struct btrfs_file_extent_item *extent;
+        int type;
+        int ret;
+
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+
+        min_key.objectid = inode->i_ino;
+        min_key.type = BTRFS_EXTENT_DATA_KEY;
+        min_key.offset = *off;
+
+        max_key.objectid = inode->i_ino;
+        max_key.type = (u8)-1;
+        max_key.offset = (u64)-1;
+
+        path->keep_locks = 1;
+
+        while(1) {
+                ret = btrfs_search_forward(root, &min_key, &max_key,
+                                           path, 0, newer_than);
+                if (ret != 0)
+                        goto none;
+                if (min_key.objectid != inode->i_ino)
+                        goto none;
+                if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+                        goto none;
+
+                leaf = path->nodes[0];
+                extent = btrfs_item_ptr(leaf, path->slots[0],
+                                        struct btrfs_file_extent_item);
+
+                type = btrfs_file_extent_type(leaf, extent);
+                if (type == BTRFS_FILE_EXTENT_REG &&
+                    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+                    check_defrag_in_cache(inode, min_key.offset, thresh)) {
+                        *off = min_key.offset;
+                        btrfs_free_path(path);
+                        return 0;
+                }
+
+                if (min_key.offset == (u64)-1)
+                        goto none;
+
+                min_key.offset++;
+                btrfs_release_path(path);
+        }
none:
+        btrfs_free_path(path);
+        return -ENOENT;
+}
+
 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
                                int thresh, u64 *last_len, u64 *skip,
                                u64 *defrag_end)
@@ -665,10 +765,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
         int ret = 1;

-
-        if (thresh == 0)
-                thresh = 256 * 1024;
-
         /*
          * make sure that once we start defragging and extent, we keep on
          * defragging it
@@ -727,27 +823,176 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
         return ret;
 }

-static int btrfs_defrag_file(struct file *file,
-                        struct btrfs_ioctl_defrag_range_args *range)
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time.  This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+                                    struct page **pages,
+                                    unsigned long start_index,
+                                    int num_pages)
 {
-        struct inode *inode = fdentry(file)->d_inode;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        unsigned long file_end;
+        u64 isize = i_size_read(inode);
+        u64 page_start;
+        u64 page_end;
+        int ret;
+        int i;
+        int i_done;
         struct btrfs_ordered_extent *ordered;
-        struct page *page;
+        struct extent_state *cached_state = NULL;
+
+        if (isize == 0)
+                return 0;
+        file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+
+        ret = btrfs_delalloc_reserve_space(inode,
+                                           num_pages << PAGE_CACHE_SHIFT);
+        if (ret)
+                return ret;
+again:
+        ret = 0;
+        i_done = 0;
+
+        /* step one, lock all the pages */
+        for (i = 0; i < num_pages; i++) {
+                struct page *page;
+                page = grab_cache_page(inode->i_mapping,
+                                       start_index + i);
+                if (!page)
+                        break;
+
+                if (!PageUptodate(page)) {
+                        btrfs_readpage(NULL, page);
+                        lock_page(page);
+                        if (!PageUptodate(page)) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                ret = -EIO;
+                                break;
+                        }
+                }
+                isize = i_size_read(inode);
+                file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+                if (!isize || page->index > file_end ||
+                    page->mapping != inode->i_mapping) {
+                        /* whoops, we blew past eof, skip this page */
+                        unlock_page(page);
+                        page_cache_release(page);
+                        break;
+                }
+                pages[i] = page;
+                i_done++;
+        }
+        if (!i_done || ret)
+                goto out;
+
+        if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                goto out;
+
+        /*
+         * so now we have a nice long stream of locked
+         * and up to date pages, lets wait on them
+         */
+        for (i = 0; i < i_done; i++)
+                wait_on_page_writeback(pages[i]);
+
+        page_start = page_offset(pages[0]);
+        page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+        lock_extent_bits(&BTRFS_I(inode)->io_tree,
+                         page_start, page_end - 1, 0, &cached_state,
+                         GFP_NOFS);
+        ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
+        if (ordered &&
+            ordered->file_offset + ordered->len > page_start &&
+            ordered->file_offset < page_end) {
+                btrfs_put_ordered_extent(ordered);
+                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                     page_start, page_end - 1,
+                                     &cached_state, GFP_NOFS);
+                for (i = 0; i < i_done; i++) {
+                        unlock_page(pages[i]);
+                        page_cache_release(pages[i]);
+                }
+                btrfs_wait_ordered_range(inode, page_start,
+                                         page_end - page_start);
+                goto again;
+        }
+        if (ordered)
+                btrfs_put_ordered_extent(ordered);
+
+        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+                         page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+                         EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+                         GFP_NOFS);
+
+        if (i_done != num_pages) {
+                atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+                btrfs_delalloc_release_space(inode,
+                                (num_pages - i_done) << PAGE_CACHE_SHIFT);
+        }
+
+
+        btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
+                                  &cached_state);
+
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                             page_start, page_end - 1, &cached_state,
+                             GFP_NOFS);
+
+        for (i = 0; i < i_done; i++) {
+                clear_page_dirty_for_io(pages[i]);
+                ClearPageChecked(pages[i]);
+                set_page_extent_mapped(pages[i]);
+                set_page_dirty(pages[i]);
+                unlock_page(pages[i]);
+                page_cache_release(pages[i]);
+        }
+        return i_done;
+out:
+        for (i = 0; i < i_done; i++) {
+                unlock_page(pages[i]);
+                page_cache_release(pages[i]);
+        }
+        btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+        return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+                      struct btrfs_ioctl_defrag_range_args *range,
+                      u64 newer_than, unsigned long max_to_defrag)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_super_block *disk_super;
+        struct file_ra_state *ra = NULL;
         unsigned long last_index;
-        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
-        unsigned long total_read = 0;
         u64 features;
-        u64 page_start;
-        u64 page_end;
         u64 last_len = 0;
         u64 skip = 0;
         u64 defrag_end = 0;
+        u64 newer_off = range->start;
+        int newer_left = 0;
         unsigned long i;
         int ret;
+        int defrag_count = 0;
         int compress_type = BTRFS_COMPRESS_ZLIB;
+        int extent_thresh = range->extent_thresh;
+        int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+        u64 new_align = ~((u64)128 * 1024 - 1);
+        struct page **pages = NULL;
+
+        if (extent_thresh == 0)
+                extent_thresh = 256 * 1024;

         if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
                 if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -759,6 +1004,27 @@ static int btrfs_defrag_file(struct file *file,
         if (inode->i_size == 0)
                 return 0;

+        /*
+         * if we were not given a file, allocate a readahead
+         * context
+         */
+        if (!file) {
+                ra = kzalloc(sizeof(*ra), GFP_NOFS);
+                if (!ra)
+                        return -ENOMEM;
+                file_ra_state_init(ra, inode->i_mapping);
+        } else {
+                ra = &file->f_ra;
+        }
+
+        pages = kmalloc(sizeof(struct page *) * newer_cluster,
+                        GFP_NOFS);
+        if (!pages) {
+                ret = -ENOMEM;
+                goto out_ra;
+        }
+
+        /* find the last page to defrag */
         if (range->start + range->len > range->start) {
                 last_index = min_t(u64, inode->i_size - 1,
                          range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -766,11 +1032,37 @@ static int btrfs_defrag_file(struct file *file,
                 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
         }

-        i = range->start >> PAGE_CACHE_SHIFT;
-        while (i <= last_index) {
-                if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+        if (newer_than) {
+                ret = find_new_extents(root, inode, newer_than,
+                                       &newer_off, 64 * 1024);
+                if (!ret) {
+                        range->start = newer_off;
+                        /*
+                         * we always align our defrag to help keep
+                         * the extents in the file evenly spaced
+                         */
+                        i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+                        newer_left = newer_cluster;
+                } else
+                        goto out_ra;
+        } else {
+                i = range->start >> PAGE_CACHE_SHIFT;
+        }
+        if (!max_to_defrag)
+                max_to_defrag = last_index - 1;
+
+        while (i <= last_index && defrag_count < max_to_defrag) {
+                /*
+                 * make sure we stop running if someone unmounts
+                 * the FS
+                 */
+                if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                        break;
+
+                if (!newer_than &&
+                    !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
                                          PAGE_CACHE_SIZE,
-                                         range->extent_thresh,
+                                         extent_thresh,
                                          &last_len, &skip,
                                          &defrag_end)) {
                         unsigned long next;
@@ -782,92 +1074,39 @@ static int btrfs_defrag_file(struct file *file,
                         i = max(i + 1, next);
                         continue;
                 }
-
-                if (total_read % ra_pages == 0) {
-                        btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
-                                       min(last_index, i + ra_pages - 1));
-                }
-                total_read++;
-                mutex_lock(&inode->i_mutex);
                 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
                         BTRFS_I(inode)->force_compress = compress_type;

-                ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-                if (ret)
-                        goto err_unlock;
-again:
-                if (inode->i_size == 0 ||
-                    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
-                        ret = 0;
-                        goto err_reservations;
-                }
+                btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);

-                page = grab_cache_page(inode->i_mapping, i);
-                if (!page) {
-                        ret = -ENOMEM;
-                        goto err_reservations;
-                }
-
-                if (!PageUptodate(page)) {
-                        btrfs_readpage(NULL, page);
-                        lock_page(page);
-                        if (!PageUptodate(page)) {
-                                unlock_page(page);
-                                page_cache_release(page);
-                                ret = -EIO;
-                                goto err_reservations;
-                        }
-                }
-
-                if (page->mapping != inode->i_mapping) {
-                        unlock_page(page);
-                        page_cache_release(page);
-                        goto again;
-                }
-
-                wait_on_page_writeback(page);
+                ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+                if (ret < 0)
+                        goto out_ra;

-                if (PageDirty(page)) {
-                        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-                        goto loop_unlock;
-                }
+                defrag_count += ret;
+                balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+                i += ret;

-                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-                page_end = page_start + PAGE_CACHE_SIZE - 1;
-                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                if (newer_than) {
+                        if (newer_off == (u64)-1)
+                                break;

-                ordered = btrfs_lookup_ordered_extent(inode, page_start);
-                if (ordered) {
-                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                        unlock_page(page);
-                        page_cache_release(page);
-                        btrfs_start_ordered_extent(inode, ordered, 1);
-                        btrfs_put_ordered_extent(ordered);
-                        goto again;
+                        newer_off = max(newer_off + 1,
+                                        (u64)i << PAGE_CACHE_SHIFT);
+
+                        ret = find_new_extents(root, inode,
+                                               newer_than, &newer_off,
+                                               64 * 1024);
+                        if (!ret) {
+                                range->start = newer_off;
+                                i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+                                newer_left = newer_cluster;
+                        } else {
+                                break;
+                        }
+                } else {
+                        i++;
                 }
-                set_page_extent_mapped(page);
-
-                /*
-                 * this makes sure page_mkwrite is called on the
-                 * page if it is dirtied again later
-                 */
-                clear_page_dirty_for_io(page);
-                clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
-                                  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
-                                  EXTENT_DO_ACCOUNTING, GFP_NOFS);
-
-                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
-                ClearPageChecked(page);
-                set_page_dirty(page);
-                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-loop_unlock:
-                unlock_page(page);
-                page_cache_release(page);
-                mutex_unlock(&inode->i_mutex);
-
-                balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
-                i++;
         }

         if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
@@ -899,12 +1138,14 @@ loop_unlock:
                 btrfs_set_super_incompat_flags(disk_super, features);
         }

-        return 0;
+        if (!file)
+                kfree(ra);
+        return defrag_count;

-err_reservations:
-        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-err_unlock:
-        mutex_unlock(&inode->i_mutex);
+out_ra:
+        if (!file)
+                kfree(ra);
+        kfree(pages);
         return ret;
 }

@@ -1756,7 +1997,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                 /* the rest are all set to zero by kzalloc */
                 range->len = (u64)-1;
         }
-        ret = btrfs_defrag_file(file, range);
+        ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+                                range, 0, 0);
+        if (ret > 0)
+                ret = 0;
         kfree(range);
         break;
     default:
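Worth noting how the two callers of the reworked btrfs_defrag_file() now
differ: the ioctl path above passes the struct file (so its readahead state
is reused) with no transid filter and no page cap, while the auto-defrag
worker added in file.c passes a NULL file and leans on the last two
arguments.  Condensed from the hunks in this patch:

        /* ioctl path: whole requested range, no newer-than filter, no cap */
        ret = btrfs_defrag_file(fdentry(file)->d_inode, file, range, 0, 0);

        /* auto defrag worker: no struct file; only extents newer than the
         * queued transid, at most defrag_batch (1024) pages per pass
         */
        num_defrag = btrfs_defrag_file(inode, NULL, &range,
                                       defrag->transid, defrag_batch);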
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e5e0ee2cad4e..ad1ea789fcb4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -181,37 +181,6 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2

-struct btrfs_ioctl_defrag_range_args {
-        /* start of the defrag operation */
-        __u64 start;
-
-        /* number of bytes to defrag, use (u64)-1 to say all */
-        __u64 len;
-
-        /*
-         * flags for the operation, which can include turning
-         * on compression for this one defrag
-         */
-        __u64 flags;
-
-        /*
-         * any extent bigger than this will be considered
-         * already defragged.  Use 0 to take the kernel default
-         * Use 1 to say every single extent must be rewritten
-         */
-        __u32 extent_thresh;
-
-        /*
-         * which compression method to use if turning on compression
-         * for this defrag operation.  If unspecified, zlib will
-         * be used
-         */
-        __u32 compress_type;
-
-        /* spare for later */
-        __u32 unused[4];
-};
-
 struct btrfs_ioctl_space_info {
         __u64 flags;
         __u64 total_bytes;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cd0c7cd2c8fb..28e3cb2607ff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -160,7 +160,7 @@ enum {
         Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
         Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
         Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-        Opt_enospc_debug, Opt_subvolrootid, Opt_err,
+        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
 };

 static match_table_t tokens = {
@@ -191,6 +191,7 @@ static match_table_t tokens = {
         {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
         {Opt_enospc_debug, "enospc_debug"},
         {Opt_subvolrootid, "subvolrootid=%d"},
+        {Opt_defrag, "autodefrag"},
         {Opt_err, NULL},
 };

@@ -369,6 +370,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                 case Opt_enospc_debug:
                         btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
                         break;
+                case Opt_defrag:
+                        printk(KERN_INFO "btrfs: enabling auto defrag");
+                        btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
+                        break;
                 case Opt_err:
                         printk(KERN_INFO "btrfs: unrecognized mount option "
                                "'%s'\n", p);