Diffstat (limited to 'fs/btrfs'):

 fs/btrfs/btrfs_inode.h |   1 
 fs/btrfs/ctree.h       |  45 
 fs/btrfs/disk-io.c     |  12 
 fs/btrfs/file.c        | 257 
 fs/btrfs/inode.c       |  12 
 fs/btrfs/ioctl.c       | 448 
 fs/btrfs/ioctl.h       |  31 
 fs/btrfs/super.c       |   7 
 8 files changed, 678 insertions(+), 135 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d0b0e43a6a8b..93b1aa932014 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,7 @@ struct btrfs_inode {
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
+	unsigned in_defrag:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 026fc47b42cf..332323e19dd1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1074,6 +1074,11 @@ struct btrfs_fs_info {
 	/* all metadata allocations go through this cluster */
 	struct btrfs_free_cluster meta_alloc_cluster;
 
+	/* auto defrag inodes go here */
+	spinlock_t defrag_inodes_lock;
+	struct rb_root defrag_inodes;
+	atomic_t defrag_running;
+
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
 
@@ -1205,6 +1210,38 @@ struct btrfs_root {
 	struct super_block anon_super;
 };
 
+struct btrfs_ioctl_defrag_range_args {
+	/* start of the defrag operation */
+	__u64 start;
+
+	/* number of bytes to defrag, use (u64)-1 to say all */
+	__u64 len;
+
+	/*
+	 * flags for the operation, which can include turning
+	 * on compression for this one defrag
+	 */
+	__u64 flags;
+
+	/*
+	 * any extent bigger than this will be considered
+	 * already defragged.  Use 0 to take the kernel default.
+	 * Use 1 to say every single extent must be rewritten
+	 */
+	__u32 extent_thresh;
+
+	/*
+	 * which compression method to use if turning on compression
+	 * for this defrag operation.  If unspecified, zlib will
+	 * be used
+	 */
+	__u32 compress_type;
+
+	/* spare for later */
+	__u32 unused[4];
+};
+
+
 /*
  * inode items have the data typically returned from stat and store other
  * info about object characteristics. There is one for every file and dir in
@@ -1302,6 +1339,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -2528,8 +2566,13 @@ extern const struct dentry_operations btrfs_dentry_operations;
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
-
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+		      struct btrfs_ioctl_defrag_range_args *range,
+		      u64 newer_than, unsigned long max_pages);
 /* file.c */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned);
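
The btrfs_ioctl_defrag_range_args definition added above is the same structure userspace hands to the BTRFS_IOC_DEFRAG_RANGE ioctl; this patch moves it out of ioctl.h (see below) so in-kernel callers such as the autodefrag runner can build one themselves. As a usage illustration only (not part of the patch), a minimal userspace sketch follows; the struct layout and the ioctl number _IOW(0x94, 16, ...) are copied from fs/btrfs/ioctl.h as of this patch, so treat them as assumptions if your headers differ:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* mirrors struct btrfs_ioctl_defrag_range_args from the kernel header */
struct btrfs_ioctl_defrag_range_args {
	__u64 start;		/* start of the defrag operation */
	__u64 len;		/* bytes to defrag, (u64)-1 means all */
	__u64 flags;		/* BTRFS_DEFRAG_RANGE_* flags */
	__u32 extent_thresh;	/* bigger extents are left alone; 0 = default */
	__u32 compress_type;	/* compression to apply when requested */
	__u32 unused[4];	/* spare for later */
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
				    struct btrfs_ioctl_defrag_range_args)
#define BTRFS_DEFRAG_RANGE_START_IO 2

int main(int argc, char **argv)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;			/* whole file */
	range.extent_thresh = 256 * 1024;	/* the kernel default */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO; /* start writeback too */
	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");
	close(fd);
	return 0;
}

Setting BTRFS_DEFRAG_RANGE_COMPRESS in flags together with a compress_type would recompress the range as it is rewritten, and extent_thresh = 1 forces every extent to be rewritten.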
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 16d335b342a2..b2588a552658 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1475,6 +1475,7 @@ static int cleaner_kthread(void *arg)
 		btrfs_run_delayed_iputs(root);
 		btrfs_clean_old_snapshots(root);
 		mutex_unlock(&root->fs_info->cleaner_mutex);
+		btrfs_run_defrag_inodes(root->fs_info);
 	}
 
 	if (freezing(current)) {
@@ -1616,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
+	spin_lock_init(&fs_info->defrag_inodes_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1638,9 +1640,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
+	atomic_set(&fs_info->defrag_running, 0);
 	fs_info->sb = sb;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
+	fs_info->defrag_inodes = RB_ROOT;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);
@@ -2501,6 +2505,14 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();
 
 	btrfs_scrub_cancel(root);
+
+	/* wait for any defraggers to finish */
+	wait_event(fs_info->transaction_wait,
+		   (atomic_read(&fs_info->defrag_running) == 0));
+
+	/* clear out the rbtree of defraggable inodes */
+	btrfs_run_defrag_inodes(root->fs_info);
+
 	btrfs_put_block_group_cache(fs_info);
 
 	/*
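
Note the shutdown ordering this hunk establishes: close_ctree() waits on transaction_wait until defrag_running reaches zero, and only then calls btrfs_run_defrag_inodes() a final time; with fs_info->closing set, that last call simply drains and frees the queued records rather than doing any work. A userspace analog of the same handshake, illustrative only, using a pthread condition variable where the kernel uses wait_event()/wake_up():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t idle = PTHREAD_COND_INITIALIZER;
static int defrag_running;

static void *defrag_pass(void *arg)
{
	pthread_mutex_lock(&lock);
	defrag_running++;		/* atomic_inc(&fs_info->defrag_running) */
	pthread_mutex_unlock(&lock);

	/* ... one pass over the queued inodes would go here ... */

	pthread_mutex_lock(&lock);
	defrag_running--;		/* atomic_dec(&fs_info->defrag_running) */
	pthread_cond_broadcast(&idle);	/* wake_up(&fs_info->transaction_wait) */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, defrag_pass, NULL);

	/* close_ctree(): wait for any defraggers to finish */
	pthread_mutex_lock(&lock);
	while (defrag_running)
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);

	/* now the rbtree of queued records can be freed safely */
	pthread_join(t, NULL);
	printf("all defraggers stopped\n");
	return 0;
}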
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58ddc4442159..c6a22d783c35 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,263 @@
 #include "locking.h"
 #include "compat.h"
 
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ */
+struct inode_defrag {
+	struct rb_node rb_node;
+	/* objectid */
+	u64 ino;
+	/*
+	 * transid where the defrag was added, we search for
+	 * extents newer than this
+	 */
+	u64 transid;
+
+	/* root objectid */
+	u64 root;
+
+	/* last offset we were able to defrag */
+	u64 last_offset;
+
+	/* if we've wrapped around back to zero once already */
+	int cycled;
+};
+
+/* put a record for an inode into the defrag tree.  The lock
+ * must be held already
+ *
+ * If you're inserting a record for an older transid than an
+ * existing record, the transid already in the tree is lowered
+ *
+ * If an existing record is found, the defrag item you
+ * pass in is freed
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *entry;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+
+	p = &root->fs_info->defrag_inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		if (defrag->ino < entry->ino)
+			p = &parent->rb_left;
+		else if (defrag->ino > entry->ino)
+			p = &parent->rb_right;
+		else {
+			/* if we're reinserting an entry for
+			 * an old defrag run, make sure to
+			 * lower the transid of our existing record
+			 */
+			if (defrag->transid < entry->transid)
+				entry->transid = defrag->transid;
+			if (defrag->last_offset > entry->last_offset)
+				entry->last_offset = defrag->last_offset;
+			goto exists;
+		}
+	}
+	BTRFS_I(inode)->in_defrag = 1;
+	rb_link_node(&defrag->rb_node, parent, p);
+	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+	return 0;
+
+exists:
+	kfree(defrag);
+	return 0;
+
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *defrag;
+	int ret = 0;
+	u64 transid;
+
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (root->fs_info->closing)
+		return 0;
+
+	if (BTRFS_I(inode)->in_defrag)
+		return 0;
+
+	if (trans)
+		transid = trans->transid;
+	else
+		transid = BTRFS_I(inode)->root->last_trans;
+
+	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	if (!defrag)
+		return -ENOMEM;
+
+	defrag->ino = inode->i_ino;
+	defrag->transid = transid;
+	defrag->root = root->root_key.objectid;
+
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	if (!BTRFS_I(inode)->in_defrag)
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	return ret;
+}
+
+/*
+ * must be called with the defrag_inodes lock held
+ */
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+					     struct rb_node **next)
+{
+	struct inode_defrag *entry = NULL;
+	struct rb_node *p;
+	struct rb_node *parent = NULL;
+
+	p = info->defrag_inodes.rb_node;
+	while (p) {
+		parent = p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		if (ino < entry->ino)
+			p = parent->rb_left;
+		else if (ino > entry->ino)
+			p = parent->rb_right;
+		else
+			return entry;
+	}
+
+	if (next) {
+		while (parent && ino > entry->ino) {
+			parent = rb_next(parent);
+			entry = rb_entry(parent, struct inode_defrag, rb_node);
+		}
+		*next = parent;
+	}
+	return NULL;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	struct btrfs_root *inode_root;
+	struct inode *inode;
+	struct rb_node *n;
+	struct btrfs_key key;
+	struct btrfs_ioctl_defrag_range_args range;
+	u64 first_ino = 0;
+	int num_defrag;
+	int defrag_batch = 1024;
+
+	memset(&range, 0, sizeof(range));
+	range.len = (u64)-1;
+
+	atomic_inc(&fs_info->defrag_running);
+	spin_lock(&fs_info->defrag_inodes_lock);
+	while (1) {
+		n = NULL;
+
+		/* find an inode to defrag */
+		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+		if (!defrag) {
+			if (n)
+				defrag = rb_entry(n, struct inode_defrag, rb_node);
+			else if (first_ino) {
+				first_ino = 0;
+				continue;
+			} else {
+				break;
+			}
+		}
+
+		/* remove it from the rbtree */
+		first_ino = defrag->ino + 1;
+		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
+
+		if (fs_info->closing)
+			goto next_free;
+
+		spin_unlock(&fs_info->defrag_inodes_lock);
+
+		/* get the inode */
+		key.objectid = defrag->root;
+		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+		key.offset = (u64)-1;
+		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(inode_root))
+			goto next;
+
+		key.objectid = defrag->ino;
+		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+		key.offset = 0;
+
+		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+		if (IS_ERR(inode))
+			goto next;
+
+		/* do a chunk of defrag */
+		BTRFS_I(inode)->in_defrag = 0;
+		range.start = defrag->last_offset;
+		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+					       defrag_batch);
+		/*
+		 * if we filled the whole defrag batch, there
+		 * must be more work to do.  Queue this defrag
+		 * again
+		 */
+		if (num_defrag == defrag_batch) {
+			defrag->last_offset = range.start;
+			__btrfs_add_inode_defrag(inode, defrag);
+			/*
+			 * we don't want to kfree defrag, we added it back to
+			 * the rbtree
+			 */
+			defrag = NULL;
+		} else if (defrag->last_offset && !defrag->cycled) {
+			/*
+			 * we didn't fill our defrag batch, but
+			 * we didn't start at zero.  Make sure we loop
+			 * around to the start of the file.
+			 */
+			defrag->last_offset = 0;
+			defrag->cycled = 1;
+			__btrfs_add_inode_defrag(inode, defrag);
+			defrag = NULL;
+		}
+
+		iput(inode);
+next:
+		spin_lock(&fs_info->defrag_inodes_lock);
+next_free:
+		kfree(defrag);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+
+	atomic_dec(&fs_info->defrag_running);
+
+	/*
+	 * during unmount, we use the transaction_wait queue to
+	 * wait for the defragger to stop
+	 */
+	wake_up(&fs_info->transaction_wait);
+	return 0;
+}
 
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
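
btrfs_run_defrag_inodes() walks the rbtree with a cursor: it takes the record with the smallest inode number >= first_ino (or the next one up, via the *next argument of btrfs_find_defrag_inode()), erases it before working on it, and wraps the cursor to zero once so records queued behind it during the pass are still seen. A standalone sketch of that scan order, illustrative only; a flagged array stands in for the rbtree, and the single wrap is modeled with a bool where the kernel relies on records being erased:

#include <stdbool.h>
#include <stdio.h>

#define NQUEUED 5

static unsigned long long queued[NQUEUED] = { 42, 7, 261, 19, 100 };
static bool done[NQUEUED];

/* smallest unprocessed ino >= cursor, or -1: the rbtree lookup stand-in */
static int find_next(unsigned long long cursor)
{
	int best = -1;

	for (int i = 0; i < NQUEUED; i++) {
		if (done[i] || queued[i] < cursor)
			continue;
		if (best < 0 || queued[i] < queued[best])
			best = i;
	}
	return best;
}

int main(void)
{
	unsigned long long first_ino = 0;
	bool wrapped = false;

	while (1) {
		int i = find_next(first_ino);

		if (i < 0) {
			/* nothing above the cursor: wrap to zero once */
			if (first_ino && !wrapped) {
				first_ino = 0;
				wrapped = true;
				continue;
			}
			break;
		}
		done[i] = true;			/* rb_erase() */
		first_ino = queued[i] + 1;	/* resume past this inode */
		printf("defrag inode %llu\n", queued[i]);
	}
	return 0;
}

Processing in inode order (7, 19, 42, 100, 261 here) keeps each pass bounded even while new records are being inserted concurrently.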
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d378f8b70ef7..bb51bb1fa44f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -342,6 +342,10 @@ static noinline int compress_file_range(struct inode *inode,
 	int will_compress;
 	int compress_type = root->fs_info->compress_type;
 
+	/* if this is a small write inside eof, kick off a defrag */
+	if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+		btrfs_add_inode_defrag(NULL, inode);
+
 	actual_end = min_t(u64, isize, end + 1);
 again:
 	will_compress = 0;
@@ -799,6 +803,10 @@ static noinline int cow_file_range(struct inode *inode,
 	disk_num_bytes = num_bytes;
 	ret = 0;
 
+	/* if this is a small write inside eof, kick off defrag */
+	if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+		btrfs_add_inode_defrag(trans, inode);
+
 	if (start == 0) {
 		/* lets try to make an inline extent */
 		ret = cow_file_range_inline(trans, root, inode,
@@ -5371,6 +5379,9 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	if (IS_ERR(trans))
 		return ERR_CAST(trans);
 
+	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
+		btrfs_add_inode_defrag(trans, inode);
+
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -6682,6 +6693,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->ordered_data_close = 0;
 	ei->orphan_meta_reserved = 0;
 	ei->dummy_inode = 0;
+	ei->in_defrag = 0;
 	ei->force_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
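
All three hooks apply one heuristic: only a small write landing inside the on-disk i_size queues the inode, since appends and large writes already produce reasonably sized extents. A standalone sketch of the predicate; the exact bound varies by call site (16K on the compression path, 64K on the buffered and direct paths, and the direct-IO hunk compares start rather than end against disk_i_size):

#include <assert.h>
#include <stdint.h>

static int small_write_inside_eof(uint64_t start, uint64_t end,
				  uint64_t disk_i_size, uint64_t thresh)
{
	return end <= disk_i_size && (end - start + 1) < thresh;
}

int main(void)
{
	/* a 4K overwrite inside a 1M file gets queued for autodefrag */
	assert(small_write_inside_eof(8192, 12287, 1 << 20, 64 * 1024));

	/* an append past i_size never queues the inode */
	assert(!small_write_inside_eof(1 << 20, (1 << 20) + 4095,
				       1 << 20, 64 * 1024));
	return 0;
}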
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c4f17e4e2c9c..85e818ce00c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -656,6 +656,106 @@ out_unlock:
 	return error;
 }
 
+/*
+ * When we're defragging a range, we don't want to kick it off again
+ * if it is really just waiting for delalloc to send it down.
+ * If we find a nice big extent or delalloc range for the bytes in the
+ * file you want to defrag, we return 0 to let you know to skip this
+ * part of the file
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 end;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	read_unlock(&em_tree->lock);
+
+	if (em) {
+		end = extent_map_end(em);
+		free_extent_map(em);
+		if (end - offset > thresh)
+			return 0;
+	}
+	/* if we already have a nice delalloc here, just stop */
+	thresh /= 2;
+	end = count_range_bits(io_tree, &offset, offset + thresh,
+			       thresh, EXTENT_DELALLOC, 1);
+	if (end >= thresh)
+		return 0;
+	return 1;
+}
+
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ */
+static int find_new_extents(struct btrfs_root *root,
+			    struct inode *inode, u64 newer_than,
+			    u64 *off, int thresh)
+{
+	struct btrfs_path *path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *extent;
+	int type;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = BTRFS_EXTENT_DATA_KEY;
+	min_key.offset = *off;
+
+	max_key.objectid = inode->i_ino;
+	max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	path->keep_locks = 1;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &min_key, &max_key,
+					   path, 0, newer_than);
+		if (ret != 0)
+			goto none;
+		if (min_key.objectid != inode->i_ino)
+			goto none;
+		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto none;
+
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_file_extent_item);
+
+		type = btrfs_file_extent_type(leaf, extent);
+		if (type == BTRFS_FILE_EXTENT_REG &&
+		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
+			*off = min_key.offset;
+			btrfs_free_path(path);
+			return 0;
+		}
+
+		if (min_key.offset == (u64)-1)
+			goto none;
+
+		min_key.offset++;
+		btrfs_release_path(path);
+	}
+none:
+	btrfs_free_path(path);
+	return -ENOENT;
+}
+
 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
			       int thresh, u64 *last_len, u64 *skip,
			       u64 *defrag_end)
@@ -665,10 +765,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 1;
 
-
-	if (thresh == 0)
-		thresh = 256 * 1024;
-
 	/*
 	 * make sure that once we start defragging an extent, we keep on
 	 * defragging it
@@ -727,27 +823,176 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	return ret;
 }
 
-static int btrfs_defrag_file(struct file *file,
-			     struct btrfs_ioctl_defrag_range_args *range)
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time.  This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+				    struct page **pages,
+				    unsigned long start_index,
+				    int num_pages)
 {
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	unsigned long file_end;
+	u64 isize = i_size_read(inode);
+	u64 page_start;
+	u64 page_end;
+	int ret;
+	int i;
+	int i_done;
 	struct btrfs_ordered_extent *ordered;
-	struct page *page;
+	struct extent_state *cached_state = NULL;
+
+	if (isize == 0)
+		return 0;
+	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+
+	ret = btrfs_delalloc_reserve_space(inode,
+					   num_pages << PAGE_CACHE_SHIFT);
+	if (ret)
+		return ret;
+again:
+	ret = 0;
+	i_done = 0;
+
+	/* step one, lock all the pages */
+	for (i = 0; i < num_pages; i++) {
+		struct page *page;
+		page = grab_cache_page(inode->i_mapping,
+				       start_index + i);
+		if (!page)
+			break;
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				break;
+			}
+		}
+		isize = i_size_read(inode);
+		file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+		if (!isize || page->index > file_end ||
+		    page->mapping != inode->i_mapping) {
+			/* whoops, we blew past eof, skip this page */
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+		pages[i] = page;
+		i_done++;
+	}
+	if (!i_done || ret)
+		goto out;
+
+	if (!(inode->i_sb->s_flags & MS_ACTIVE))
+		goto out;
+
+	/*
+	 * so now we have a nice long stream of locked
+	 * and up to date pages, lets wait on them
+	 */
+	for (i = 0; i < i_done; i++)
+		wait_on_page_writeback(pages[i]);
+
+	page_start = page_offset(pages[0]);
+	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree,
+			 page_start, page_end - 1, 0, &cached_state,
+			 GFP_NOFS);
+	ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
+	if (ordered &&
+	    ordered->file_offset + ordered->len > page_start &&
+	    ordered->file_offset < page_end) {
+		btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				     page_start, page_end - 1,
+				     &cached_state, GFP_NOFS);
+		for (i = 0; i < i_done; i++) {
+			unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		btrfs_wait_ordered_range(inode, page_start,
+					 page_end - page_start);
+		goto again;
+	}
+	if (ordered)
+		btrfs_put_ordered_extent(ordered);
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+			 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+			 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+			 GFP_NOFS);
+
+	if (i_done != num_pages) {
+		atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+		btrfs_delalloc_release_space(inode,
+				     (num_pages - i_done) << PAGE_CACHE_SHIFT);
+	}
+
+
+	btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
+				  &cached_state);
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+			     page_start, page_end - 1, &cached_state,
+			     GFP_NOFS);
+
+	for (i = 0; i < i_done; i++) {
+		clear_page_dirty_for_io(pages[i]);
+		ClearPageChecked(pages[i]);
+		set_page_extent_mapped(pages[i]);
+		set_page_dirty(pages[i]);
+		unlock_page(pages[i]);
+		page_cache_release(pages[i]);
+	}
+	return i_done;
+out:
+	for (i = 0; i < i_done; i++) {
+		unlock_page(pages[i]);
+		page_cache_release(pages[i]);
+	}
+	btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+	return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+		      struct btrfs_ioctl_defrag_range_args *range,
+		      u64 newer_than, unsigned long max_to_defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_super_block *disk_super;
+	struct file_ra_state *ra = NULL;
 	unsigned long last_index;
-	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
-	unsigned long total_read = 0;
 	u64 features;
-	u64 page_start;
-	u64 page_end;
 	u64 last_len = 0;
 	u64 skip = 0;
 	u64 defrag_end = 0;
+	u64 newer_off = range->start;
+	int newer_left = 0;
 	unsigned long i;
 	int ret;
+	int defrag_count = 0;
 	int compress_type = BTRFS_COMPRESS_ZLIB;
+	int extent_thresh = range->extent_thresh;
+	int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	u64 new_align = ~((u64)128 * 1024 - 1);
+	struct page **pages = NULL;
+
+	if (extent_thresh == 0)
+		extent_thresh = 256 * 1024;
 
 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
 		if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -759,6 +1004,27 @@ static int btrfs_defrag_file(struct file *file,
 	if (inode->i_size == 0)
 		return 0;
 
+	/*
+	 * if we were not given a file, allocate a readahead
+	 * context
+	 */
+	if (!file) {
+		ra = kzalloc(sizeof(*ra), GFP_NOFS);
+		if (!ra)
+			return -ENOMEM;
+		file_ra_state_init(ra, inode->i_mapping);
+	} else {
+		ra = &file->f_ra;
+	}
+
+	pages = kmalloc(sizeof(struct page *) * newer_cluster,
+			GFP_NOFS);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out_ra;
+	}
+
+	/* find the last page to defrag */
 	if (range->start + range->len > range->start) {
 		last_index = min_t(u64, inode->i_size - 1,
			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -766,11 +1032,37 @@ static int btrfs_defrag_file(struct file *file,
 		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
 	}
 
-	i = range->start >> PAGE_CACHE_SHIFT;
-	while (i <= last_index) {
-		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+	if (newer_than) {
+		ret = find_new_extents(root, inode, newer_than,
+				       &newer_off, 64 * 1024);
+		if (!ret) {
+			range->start = newer_off;
+			/*
+			 * we always align our defrag to help keep
+			 * the extents in the file evenly spaced
+			 */
+			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+			newer_left = newer_cluster;
+		} else
+			goto out_ra;
+	} else {
+		i = range->start >> PAGE_CACHE_SHIFT;
+	}
+	if (!max_to_defrag)
+		max_to_defrag = last_index - 1;
+
+	while (i <= last_index && defrag_count < max_to_defrag) {
+		/*
+		 * make sure we stop running if someone unmounts
+		 * the FS
+		 */
+		if (!(inode->i_sb->s_flags & MS_ACTIVE))
+			break;
+
+		if (!newer_than &&
+		    !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
					PAGE_CACHE_SIZE,
-					range->extent_thresh,
+					extent_thresh,
					&last_len, &skip,
					&defrag_end)) {
			unsigned long next;
@@ -782,92 +1074,39 @@ static int btrfs_defrag_file(struct file *file,
 			i = max(i + 1, next);
 			continue;
 		}
-
-		if (total_read % ra_pages == 0) {
-			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
-				       min(last_index, i + ra_pages - 1));
-		}
-		total_read++;
-		mutex_lock(&inode->i_mutex);
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = compress_type;
 
-		ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-		if (ret)
-			goto err_unlock;
-again:
-		if (inode->i_size == 0 ||
-		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
-			ret = 0;
-			goto err_reservations;
-		}
+		btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
 
-		page = grab_cache_page(inode->i_mapping, i);
-		if (!page) {
-			ret = -ENOMEM;
-			goto err_reservations;
-		}
-
-		if (!PageUptodate(page)) {
-			btrfs_readpage(NULL, page);
-			lock_page(page);
-			if (!PageUptodate(page)) {
-				unlock_page(page);
-				page_cache_release(page);
-				ret = -EIO;
-				goto err_reservations;
-			}
-		}
-
-		if (page->mapping != inode->i_mapping) {
-			unlock_page(page);
-			page_cache_release(page);
-			goto again;
-		}
-
-		wait_on_page_writeback(page);
+		ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+		if (ret < 0)
+			goto out_ra;
 
-		if (PageDirty(page)) {
-			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-			goto loop_unlock;
-		}
+		defrag_count += ret;
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		i += ret;
 
-		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-		page_end = page_start + PAGE_CACHE_SIZE - 1;
-		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		if (newer_than) {
+			if (newer_off == (u64)-1)
+				break;
 
-		ordered = btrfs_lookup_ordered_extent(inode, page_start);
-		if (ordered) {
-			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-			unlock_page(page);
-			page_cache_release(page);
-			btrfs_start_ordered_extent(inode, ordered, 1);
-			btrfs_put_ordered_extent(ordered);
-			goto again;
+			newer_off = max(newer_off + 1,
+					(u64)i << PAGE_CACHE_SHIFT);
+
+			ret = find_new_extents(root, inode,
+					       newer_than, &newer_off,
+					       64 * 1024);
+			if (!ret) {
+				range->start = newer_off;
+				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+				newer_left = newer_cluster;
+			} else {
+				break;
+			}
+		} else {
+			i++;
 		}
-		set_page_extent_mapped(page);
-
-		/*
-		 * this makes sure page_mkwrite is called on the
-		 * page if it is dirtied again later
-		 */
-		clear_page_dirty_for_io(page);
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
-				  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING, GFP_NOFS);
-
-		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
-		ClearPageChecked(page);
-		set_page_dirty(page);
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-loop_unlock:
-		unlock_page(page);
-		page_cache_release(page);
-		mutex_unlock(&inode->i_mutex);
-
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
-		i++;
 	}
 
 	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
@@ -899,12 +1138,14 @@ loop_unlock:
 		btrfs_set_super_incompat_flags(disk_super, features);
 	}
 
-	return 0;
+	if (!file)
+		kfree(ra);
+	return defrag_count;
 
-err_reservations:
-	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-err_unlock:
-	mutex_unlock(&inode->i_mutex);
+out_ra:
+	if (!file)
+		kfree(ra);
+	kfree(pages);
 	return ret;
 }
 
@@ -1756,7 +1997,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 		/* the rest are all set to zero by kzalloc */
 		range->len = (u64)-1;
 	}
-	ret = btrfs_defrag_file(file, range);
+	ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+				range, 0, 0);
+	if (ret > 0)
+		ret = 0;
 	kfree(range);
 	break;
 default:
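
One detail in btrfs_defrag_file() worth spelling out is new_align = ~((u64)128 * 1024 - 1): ANDing an offset with this mask rounds it down to a 128K boundary, which keeps each newer_cluster pass (256K worth of pages) starting on an aligned offset so the rewritten extents stay evenly spaced. A tiny standalone check of the arithmetic (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t new_align = ~((uint64_t)128 * 1024 - 1);

	assert((200000ULL & new_align) == 131072);	/* rounds down to 128K */
	assert((131072ULL & new_align) == 131072);	/* already aligned */
	assert((131071ULL & new_align) == 0);		/* below 128K -> 0 */
	return 0;
}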
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e5e0ee2cad4e..ad1ea789fcb4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -181,37 +181,6 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2
 
-struct btrfs_ioctl_defrag_range_args {
-	/* start of the defrag operation */
-	__u64 start;
-
-	/* number of bytes to defrag, use (u64)-1 to say all */
-	__u64 len;
-
-	/*
-	 * flags for the operation, which can include turning
-	 * on compression for this one defrag
-	 */
-	__u64 flags;
-
-	/*
-	 * any extent bigger than this will be considered
-	 * already defragged.  Use 0 to take the kernel default
-	 * Use 1 to say every single extent must be rewritten
-	 */
-	__u32 extent_thresh;
-
-	/*
-	 * which compression method to use if turning on compression
-	 * for this defrag operation.  If unspecified, zlib will
-	 * be used
-	 */
-	__u32 compress_type;
-
-	/* spare for later */
-	__u32 unused[4];
-};
-
 struct btrfs_ioctl_space_info {
 	__u64 flags;
 	__u64 total_bytes;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cd0c7cd2c8fb..28e3cb2607ff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -160,7 +160,7 @@ enum {
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-	Opt_enospc_debug, Opt_subvolrootid, Opt_err,
+	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
 };
165 165
166static match_table_t tokens = { 166static match_table_t tokens = {
@@ -191,6 +191,7 @@ static match_table_t tokens = {
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
 	{Opt_enospc_debug, "enospc_debug"},
 	{Opt_subvolrootid, "subvolrootid=%d"},
+	{Opt_defrag, "autodefrag"},
 	{Opt_err, NULL},
 };
 
@@ -369,6 +370,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_enospc_debug:
 			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
 			break;
+		case Opt_defrag:
+			printk(KERN_INFO "btrfs: enabling auto defrag\n");
+			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);