path: root/fs/btrfs/file.c
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-18 12:42:05 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-18 12:42:05 -0500
commit	a22180d2666c018f4fef6818074d78bb76ff2bda (patch)
tree	a633aaf423ff39f94d00502d03dbbd99dab4b2ee /fs/btrfs/file.c
parent	2d4dce0070448bcb5ccd04553a4be4635417f565 (diff)
parent	213490b301773ea9c6fb89a86424a6901fcdd069 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
 "A big set of fixes and features.

  In terms of line count, most of the code comes from Stefan, who added
  the ability to replace a single drive in place. This is different from
  how btrfs normally replaces drives, and is much much much faster.

  Josef is plowing through our synchronous write performance. This pull
  request does not include the DIO_OWN_WAITING patch that was discussed
  on the list, but it has a number of other improvements to cut down our
  latencies and CPU time during fsync/O_DIRECT writes.

  Miao Xie has a big series of fixes and is spreading out ordered
  operations over more CPUs. This improves performance and reduces
  contention.

  I've put in fixes for error handling around hash collisions. These are
  going back to individual stable kernels as I test against them.

  Otherwise we have a lot of fixes and cleanups, thanks everyone!
  raid5/6 is being rebased against the device replacement code. I'll
  have it posted this Friday along with a nice series of benchmarks."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
  Btrfs: fix a bug of per-file nocow
  Btrfs: fix hash overflow handling
  Btrfs: don't take inode delalloc mutex if we're a free space inode
  Btrfs: fix autodefrag and umount lockup
  Btrfs: fix permissions of empty files not affected by umask
  Btrfs: put raid properties into global table
  Btrfs: fix BUG() in scrub when first superblock reading gives EIO
  Btrfs: do not call file_update_time in aio_write
  Btrfs: only unlock and relock if we have to
  Btrfs: use tokens where we can in the tree log
  Btrfs: optimize leaf_space_used
  Btrfs: don't memset new tokens
  Btrfs: only clear dirty on the buffer if it is marked as dirty
  Btrfs: move checks in set_page_dirty under DEBUG
  Btrfs: log changed inodes based on the extent map tree
  Btrfs: add path->really_keep_locks
  Btrfs: do not mark ems as prealloc if we are writing to them
  Btrfs: keep track of the extents original block length
  Btrfs: inline csums if we're fsyncing
  Btrfs: don't bother copying if we're only logging the inode
  ...
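The fsync/O_DSYNC work mentioned above ("inline csums if we're fsyncing") shows up in this file as a new per-inode sync_writers counter: a writer that knows it will wait for the data bumps the counter, so the submission path can do its work inline instead of punting to worker threads. A minimal userspace sketch of that pattern, with toy types and invented names rather than the btrfs structures:

/* Hedged sketch: "toy_inode" and these helpers are invented; only the
 * counter pattern mirrors the patch below. */
#include <stdatomic.h>
#include <stdio.h>

struct toy_inode {
	atomic_int sync_writers;	/* writers that will sync this inode */
};

static void begin_sync_write(struct toy_inode *ino)
{
	atomic_fetch_add(&ino->sync_writers, 1);
}

static void end_sync_write(struct toy_inode *ino)
{
	atomic_fetch_sub(&ino->sync_writers, 1);
}

static void submit_write(struct toy_inode *ino)
{
	/* With a waiter present, doing the work inline cuts latency;
	 * otherwise it can be deferred to helper threads. */
	if (atomic_load(&ino->sync_writers))
		printf("checksum inline: a syncing writer is waiting\n");
	else
		printf("defer checksumming to worker threads\n");
}

int main(void)
{
	struct toy_inode ino = { .sync_writers = 0 };

	begin_sync_write(&ino);		/* e.g. an O_DSYNC write */
	submit_write(&ino);
	end_sync_write(&ino);
	submit_write(&ino);		/* plain buffered write */
	return 0;
}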
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--	fs/btrfs/file.c	406
1 file changed, 264 insertions(+), 142 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c6673a9231f..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
 #include "compat.h"
 #include "volumes.h"
 
+static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
  * when auto defrag is enabled we
  * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
 				    struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
 				entry->transid = defrag->transid;
 			if (defrag->last_offset > entry->last_offset)
 				entry->last_offset = defrag->last_offset;
-			goto exists;
+			return -EEXIST;
 		}
 	}
 	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return;
+	return 0;
+}
 
-exists:
-	kfree(defrag);
-	return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (btrfs_fs_closing(root->fs_info))
+		return 0;
 
+	return 1;
 }
 
 /*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
 	u64 transid;
+	int ret;
 
-	if (!btrfs_test_opt(root, AUTO_DEFRAG))
-		return 0;
-
-	if (btrfs_fs_closing(root->fs_info))
+	if (!__need_auto_defrag(root))
 		return 0;
 
 	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	else
 		transid = BTRFS_I(inode)->root->last_trans;
 
-	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 	if (!defrag)
 		return -ENOMEM;
 
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	defrag->root = root->root_key.objectid;
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
-	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
-		__btrfs_add_inode_defrag(inode, defrag);
-	else
-		kfree(defrag);
+	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+		/*
+		 * If we set IN_DEFRAG flag and evict the inode from memory,
+		 * and then re-read this inode, this new inode doesn't have
+		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
+		 */
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+		if (ret)
+			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
 	return 0;
 }
 
 /*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
  */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
-					     u64 root, u64 ino,
-					     struct rb_node **next)
+void btrfs_requeue_inode_defrag(struct inode *inode,
+				struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (!__need_auto_defrag(root))
+		goto out;
+
+	/*
+	 * Here we don't check the IN_DEFRAG flag, because we need merge
+	 * them together.
+	 */
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	ret = __btrfs_add_inode_defrag(inode, defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	if (ret)
+		goto out;
+	return;
+out:
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 {
 	struct inode_defrag *entry = NULL;
 	struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
 	tmp.ino = ino;
 	tmp.root = root;
 
-	p = info->defrag_inodes.rb_node;
+	spin_lock(&fs_info->defrag_inodes_lock);
+	p = fs_info->defrag_inodes.rb_node;
 	while (p) {
 		parent = p;
 		entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
 		else if (ret > 0)
 			p = parent->rb_right;
 		else
-			return entry;
+			goto out;
 	}
 
-	if (next) {
-		while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
-			parent = rb_next(parent);
+	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+		parent = rb_next(parent);
+		if (parent)
 			entry = rb_entry(parent, struct inode_defrag, rb_node);
-		}
-		*next = parent;
+		else
+			entry = NULL;
 	}
-	return NULL;
+out:
+	if (entry)
+		rb_erase(parent, &fs_info->defrag_inodes);
+	spin_unlock(&fs_info->defrag_inodes_lock);
+	return entry;
 }
 
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 {
 	struct inode_defrag *defrag;
+	struct rb_node *node;
+
+	spin_lock(&fs_info->defrag_inodes_lock);
+	node = rb_first(&fs_info->defrag_inodes);
+	while (node) {
+		rb_erase(node, &fs_info->defrag_inodes);
+		defrag = rb_entry(node, struct inode_defrag, rb_node);
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+		if (need_resched()) {
+			spin_unlock(&fs_info->defrag_inodes_lock);
+			cond_resched();
+			spin_lock(&fs_info->defrag_inodes_lock);
+		}
+
+		node = rb_first(&fs_info->defrag_inodes);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH	1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+				    struct inode_defrag *defrag)
+{
 	struct btrfs_root *inode_root;
 	struct inode *inode;
-	struct rb_node *n;
 	struct btrfs_key key;
 	struct btrfs_ioctl_defrag_range_args range;
-	u64 first_ino = 0;
-	u64 root_objectid = 0;
 	int num_defrag;
-	int defrag_batch = 1024;
 
+	/* get the inode */
+	key.objectid = defrag->root;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.offset = (u64)-1;
+	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(inode_root)) {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+		return PTR_ERR(inode_root);
+	}
+
+	key.objectid = defrag->ino;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+	if (IS_ERR(inode)) {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+		return PTR_ERR(inode);
+	}
+
+	/* do a chunk of defrag */
+	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 	memset(&range, 0, sizeof(range));
 	range.len = (u64)-1;
+	range.start = defrag->last_offset;
+
+	sb_start_write(fs_info->sb);
+	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+				       BTRFS_DEFRAG_BATCH);
+	sb_end_write(fs_info->sb);
+	/*
+	 * if we filled the whole defrag batch, there
+	 * must be more work to do.  Queue this defrag
+	 * again
+	 */
+	if (num_defrag == BTRFS_DEFRAG_BATCH) {
+		defrag->last_offset = range.start;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else if (defrag->last_offset && !defrag->cycled) {
+		/*
+		 * we didn't fill our defrag batch, but
+		 * we didn't start at zero.  Make sure we loop
+		 * around to the start of the file.
+		 */
+		defrag->last_offset = 0;
+		defrag->cycled = 1;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
+
+	iput(inode);
+	return 0;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	u64 first_ino = 0;
+	u64 root_objectid = 0;
 
 	atomic_inc(&fs_info->defrag_running);
-	spin_lock(&fs_info->defrag_inodes_lock);
 	while(1) {
-		n = NULL;
+		if (!__need_auto_defrag(fs_info->tree_root))
+			break;
 
 		/* find an inode to defrag */
-		defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
-						 first_ino, &n);
+		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+						 first_ino);
 		if (!defrag) {
-			if (n) {
-				defrag = rb_entry(n, struct inode_defrag,
-						  rb_node);
-			} else if (root_objectid || first_ino) {
+			if (root_objectid || first_ino) {
 				root_objectid = 0;
 				first_ino = 0;
 				continue;
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 			}
 		}
 
-		/* remove it from the rbtree */
 		first_ino = defrag->ino + 1;
 		root_objectid = defrag->root;
-		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
-		if (btrfs_fs_closing(fs_info))
-			goto next_free;
-
-		spin_unlock(&fs_info->defrag_inodes_lock);
-
-		/* get the inode */
-		key.objectid = defrag->root;
-		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-		key.offset = (u64)-1;
-		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-		if (IS_ERR(inode_root))
-			goto next;
-
-		key.objectid = defrag->ino;
-		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-		key.offset = 0;
-
-		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-		if (IS_ERR(inode))
-			goto next;
 
-		/* do a chunk of defrag */
-		clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
-		range.start = defrag->last_offset;
-		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-					       defrag_batch);
-		/*
-		 * if we filled the whole defrag batch, there
-		 * must be more work to do.  Queue this defrag
-		 * again
-		 */
-		if (num_defrag == defrag_batch) {
-			defrag->last_offset = range.start;
-			__btrfs_add_inode_defrag(inode, defrag);
-			/*
-			 * we don't want to kfree defrag, we added it back to
-			 * the rbtree
-			 */
-			defrag = NULL;
-		} else if (defrag->last_offset && !defrag->cycled) {
-			/*
-			 * we didn't fill our defrag batch, but
-			 * we didn't start at zero.  Make sure we loop
-			 * around to the start of the file.
-			 */
-			defrag->last_offset = 0;
-			defrag->cycled = 1;
-			__btrfs_add_inode_defrag(inode, defrag);
-			defrag = NULL;
-		}
-
-		iput(inode);
-next:
-		spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
-		kfree(defrag);
+		__btrfs_run_defrag_inode(fs_info, defrag);
 	}
-	spin_unlock(&fs_info->defrag_inodes_lock);
-
 	atomic_dec(&fs_info->defrag_running);
 
 	/*
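The new btrfs_pick_defrag_inode() above folds lookup and removal into one pass under the lock: find the record for (root, ino) or the next one in key order, erase it, and hand it to the caller; btrfs_run_defrag_inodes() then resumes from ino + 1 and wraps to (0, 0) when it falls off the end. A rough userspace sketch of that traversal, with a sorted array standing in for the kernel rbtree and all names invented:

/* Hedged sketch of the pick-then-erase pattern; not kernel code. */
#include <stdio.h>
#include <string.h>

struct record { unsigned long long root, ino; };

static int cmp(const struct record *a, const struct record *b)
{
	if (a->root != b->root)
		return a->root < b->root ? -1 : 1;
	if (a->ino != b->ino)
		return a->ino < b->ino ? -1 : 1;
	return 0;
}

/* Pick the record >= (root, ino) and erase it; returns 1 on success. */
static int pick(struct record *set, int *n, unsigned long long root,
		unsigned long long ino, struct record *out)
{
	struct record key = { root, ino };
	int i;

	for (i = 0; i < *n; i++) {
		if (cmp(&set[i], &key) >= 0) {
			*out = set[i];
			memmove(&set[i], &set[i + 1],
				(*n - i - 1) * sizeof(*set));
			(*n)--;
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	struct record set[] = { {1, 10}, {1, 42}, {2, 7} };
	int n = 3;
	unsigned long long root = 1, ino = 11;	/* resume point */
	struct record r;

	while (1) {
		if (!pick(set, &n, root, ino, &r)) {
			if (root || ino) {	/* wrap, like the kernel loop */
				root = ino = 0;
				continue;
			}
			break;
		}
		printf("defrag root=%llu ino=%llu\n", r.root, r.ino);
		root = r.root;
		ino = r.ino + 1;
	}
	return 0;
}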
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->block_len = em->block_len;
 		else
 			split->block_len = split->len;
+		split->orig_block_len = max(split->block_len,
+					    em->orig_block_len);
 		split->generation = gen;
 		split->bdev = em->bdev;
 		split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		split->flags = flags;
 		split->compress_type = em->compress_type;
 		split->generation = gen;
+		split->orig_block_len = max(em->block_len,
+					    em->orig_block_len);
 
 		if (compressed) {
 			split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		} else {
 			split->block_len = split->len;
 			split->block_start = em->block_start + diff;
-			split->orig_start = split->start;
+			split->orig_start = em->orig_start;
 		}
 
 		ret = add_extent_mapping(em_tree, split);
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-			btrfs_btree_balance_dirty(root, 1);
+			btrfs_btree_balance_dirty(root);
 
 		pos += copied;
 		num_written += copied;
@@ -1397,6 +1463,24 @@ out:
 	return written ? written : err;
 }
 
+static void update_time_for_write(struct inode *inode)
+{
+	struct timespec now;
+
+	if (IS_NOCMTIME(inode))
+		return;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		inode->i_mtime = now;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		inode->i_ctime = now;
+
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
+}
+
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos)
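update_time_for_write() above exists because the write path already reserves space that covers the inode update, so there is no need for the transaction that file_update_time() would start; the helper just stamps the in-memory times, skipping stores whose value would not change. A standalone sketch of the same compare-before-store idea, using plain POSIX timespec rather than the kernel helpers:

/* Hedged userspace sketch; "toy_inode" and ts_equal() are invented. */
#include <stdio.h>
#include <time.h>

struct toy_inode {
	struct timespec mtime, ctime;
	unsigned long long version;
};

static int ts_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

static void update_time_for_write(struct toy_inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	/* Only touch the fields that actually change. */
	if (!ts_equal(&inode->mtime, &now))
		inode->mtime = now;
	if (!ts_equal(&inode->ctime, &now))
		inode->ctime = now;
	inode->version++;	/* unconditional here; the kernel gates this on IS_I_VERSION */
}

int main(void)
{
	struct toy_inode inode = { 0 };

	update_time_for_write(&inode);
	printf("mtime=%lld.%09ld version=%llu\n",
	       (long long)inode.mtime.tv_sec, inode.mtime.tv_nsec,
	       inode.version);
	return 0;
}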
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
+	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 
 	sb_start_write(inode->i_sb);
 
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		goto out;
 	}
 
-	err = file_update_time(file);
-	if (err) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
-	}
+	/*
+	 * We reserve space for updating the inode when we reserve space for the
+	 * extent we are going to write, so we will enospc out there.  We don't
+	 * need to start yet another transaction to update the inode as we will
+	 * update the inode when we finish writing whatever data we write.
+	 */
+	update_time_for_write(inode);
 
 	start_pos = round_down(pos, root->sectorsize);
 	if (start_pos > i_size_read(inode)) {
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		}
 	}
 
+	if (sync)
+		atomic_inc(&BTRFS_I(inode)->sync_writers);
+
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
 						   pos, ppos, count, ocount);
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	 * this will either be one more than the running transaction
 	 * or the generation used for the next transaction if there isn't
 	 * one running right now.
+	 *
+	 * We also have to set last_sub_trans to the current log transid,
+	 * otherwise subsequent syncs to a file that's been synced in this
+	 * transaction will appear to have already occured.
 	 */
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0 || num_written == -EIOCBQUEUED) {
 		err = generic_write_sync(file, pos, num_written);
 		if (err < 0 && num_written > 0)
 			num_written = err;
 	}
 out:
+	if (sync)
+		atomic_dec(&BTRFS_I(inode)->sync_writers);
 	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * out of the ->i_mutex. If so, we can flush the dirty pages by
 	 * multi-task, and make the performance up.
 	 */
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
 	if (ret)
 		return ret;
 
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * range being left.
 	 */
 	atomic_inc(&root->log_batch);
-	btrfs_wait_ordered_range(inode, start, end);
+	btrfs_wait_ordered_range(inode, start, end - start + 1);
 	atomic_inc(&root->log_batch);
 
 	/*
@@ -1767,6 +1866,7 @@ out:
 
 	hole_em->block_start = EXTENT_MAP_HOLE;
 	hole_em->block_len = 0;
+	hole_em->orig_block_len = 0;
 	hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 	hole_em->compress_type = BTRFS_COMPRESS_NONE;
 	hole_em->generation = trans->transid;
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *rsv;
 	struct btrfs_trans_handle *trans;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-	u64 lockstart = (offset + mask) & ~mask;
-	u64 lockend = ((offset + len) & ~mask) - 1;
+	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+	u64 lockend = round_down(offset + len,
+				 BTRFS_I(inode)->root->sectorsize) - 1;
 	u64 cur_offset = lockstart;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 	u64 drop_end;
-	unsigned long nr;
 	int ret = 0;
 	int err = 0;
-	bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
-		((offset + len) >> PAGE_CACHE_SHIFT);
+	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
 	btrfs_wait_ordered_range(inode, offset, len);
 
 	mutex_lock(&inode->i_mutex);
-	if (offset >= inode->i_size) {
-		mutex_unlock(&inode->i_mutex);
-		return 0;
-	}
-
+	/*
+	 * We needn't truncate any page which is beyond the end of the file
+	 * because we are sure there is no data there.
+	 */
 	/*
 	 * Only do this if we are in the same page and we aren't doing the
 	 * entire page.
 	 */
 	if (same_page && len < PAGE_CACHE_SIZE) {
-		ret = btrfs_truncate_page(inode, offset, len, 0);
+		if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+			ret = btrfs_truncate_page(inode, offset, len, 0);
 		mutex_unlock(&inode->i_mutex);
 		return ret;
 	}
 
 	/* zero back part of the first page */
-	ret = btrfs_truncate_page(inode, offset, 0, 0);
-	if (ret) {
-		mutex_unlock(&inode->i_mutex);
-		return ret;
+	if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+		ret = btrfs_truncate_page(inode, offset, 0, 0);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
 	}
 
 	/* zero the front end of the last page */
-	ret = btrfs_truncate_page(inode, offset + len, 0, 1);
-	if (ret) {
-		mutex_unlock(&inode->i_mutex);
-		return ret;
+	if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+		ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
 	}
 
 	if (lockend < lockstart) {
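The hole-punch hunk above replaces open-coded mask arithmetic with round_up()/round_down(): lockstart rounds the start of the hole up to the next sector boundary and lockend rounds the end down, so only fully covered sectors are dropped. A small worked example (the macros below assume a power-of-two sector size, as the kernel helpers do; this is a sketch, not kernel code):

#include <stdio.h>

#define ROUND_DOWN(x, a)	((x) & ~((a) - 1))
#define ROUND_UP(x, a)		ROUND_DOWN((x) + (a) - 1, (a))

int main(void)
{
	unsigned long long offset = 3000, len = 9000, sectorsize = 4096;
	unsigned long long lockstart = ROUND_UP(offset, sectorsize);
	unsigned long long lockend = ROUND_DOWN(offset + len, sectorsize) - 1;

	/* Only the fully covered sector range is dropped: [4096, 8191]. */
	printf("lockstart=%llu lockend=%llu\n", lockstart, lockend);
	return 0;
}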
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			break;
 		}
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 
 		trans = btrfs_start_transaction(root, 3);
 		if (IS_ERR(trans)) {
@@ -1963,11 +2065,13 @@ out_trans:
 	if (!trans)
 		goto out_free;
 
+	inode_inc_iversion(inode);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	ret = btrfs_update_inode(trans, root, inode);
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 out_free:
 	btrfs_free_path(path);
 	btrfs_free_block_rsv(root, rsv);
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	u64 alloc_end;
 	u64 alloc_hint = 0;
 	u64 locked_end;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
+	int blocksize = BTRFS_I(inode)->root->sectorsize;
 	int ret;
 
-	alloc_start = offset & ~mask;
-	alloc_end = (offset + len + mask) & ~mask;
+	alloc_start = round_down(offset, blocksize);
+	alloc_end = round_up(offset + len, blocksize);
 
 	/* Make sure we aren't being give some crap mode */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * Make sure we have enough space before we do the
 	 * allocation.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		return ret;
 
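The two fallocate hunks (here and at the end of the function) drop an off-by-one: alloc_start and alloc_end are both block-aligned, so the byte count to reserve is exactly alloc_end - alloc_start; the old "+ 1" asked for one byte too many. A worked example under an assumed 4096-byte blocksize:

/* Hedged illustration of the arithmetic only; not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long long blocksize = 4096;
	unsigned long long offset = 3000, len = 5000;
	unsigned long long alloc_start = offset & ~(blocksize - 1);	/* 0 */
	unsigned long long alloc_end = (offset + len + blocksize - 1)
				       & ~(blocksize - 1);		/* 8192 */

	printf("reserve %llu bytes (old code asked for %llu)\n",
	       alloc_end - alloc_start, alloc_end - alloc_start + 1);
	return 0;
}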
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		}
 		last_byte = min(extent_map_end(em), alloc_end);
 		actual_end = min_t(u64, extent_map_end(em), offset + len);
-		last_byte = (last_byte + mask) & ~mask;
+		last_byte = ALIGN(last_byte, blocksize);
 
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 	return ret;
 }
 
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
+
+void btrfs_auto_defrag_exit(void)
+{
+	if (btrfs_inode_defrag_cachep)
+		kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+					sizeof(struct inode_defrag), 0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_inode_defrag_cachep)
+		return -ENOMEM;
+
+	return 0;
+}