Diffstat (limited to 'fs')
-rw-r--r--  fs/affs/namei.c | 2
-rw-r--r--  fs/aio.c | 65
-rw-r--r--  fs/autofs/root.c | 1
-rw-r--r--  fs/autofs4/dev-ioctl.c | 18
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/ctree.c | 109
-rw-r--r--  fs/btrfs/ctree.h | 163
-rw-r--r--  fs/btrfs/delayed-ref.c | 101
-rw-r--r--  fs/btrfs/delayed-ref.h | 3
-rw-r--r--  fs/btrfs/disk-io.c | 169
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 2255
-rw-r--r--  fs/btrfs/extent_io.c | 85
-rw-r--r--  fs/btrfs/extent_io.h | 14
-rw-r--r--  fs/btrfs/file-item.c | 28
-rw-r--r--  fs/btrfs/file.c | 166
-rw-r--r--  fs/btrfs/inode-item.c | 27
-rw-r--r--  fs/btrfs/inode.c | 1713
-rw-r--r--  fs/btrfs/ioctl.c | 206
-rw-r--r--  fs/btrfs/ordered-data.c | 82
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/relocation.c | 1971
-rw-r--r--  fs/btrfs/root-tree.c | 23
-rw-r--r--  fs/btrfs/super.c | 35
-rw-r--r--  fs/btrfs/transaction.c | 232
-rw-r--r--  fs/btrfs/transaction.h | 24
-rw-r--r--  fs/btrfs/tree-defrag.c | 7
-rw-r--r--  fs/btrfs/tree-log.c | 241
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 17
-rw-r--r--  fs/btrfs/xattr.c | 12
-rw-r--r--  fs/compat.c | 132
-rw-r--r--  fs/direct-io.c | 62
-rw-r--r--  fs/exec.c | 195
-rw-r--r--  fs/ext4/balloc.c | 5
-rw-r--r--  fs/ext4/block_validity.c | 4
-rw-r--r--  fs/ext4/dir.c | 26
-rw-r--r--  fs/ext4/ext4.h | 167
-rw-r--r--  fs/ext4/ext4_jbd2.h | 8
-rw-r--r--  fs/ext4/extents.c | 417
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/fsync.c | 35
-rw-r--r--  fs/ext4/ialloc.c | 89
-rw-r--r--  fs/ext4/inode.c | 723
-rw-r--r--  fs/ext4/ioctl.c | 27
-rw-r--r--  fs/ext4/mballoc.c | 120
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/move_extent.c | 13
-rw-r--r--  fs/ext4/namei.c | 61
-rw-r--r--  fs/ext4/resize.c | 3
-rw-r--r--  fs/ext4/super.c | 80
-rw-r--r--  fs/ext4/symlink.c | 2
-rw-r--r--  fs/ext4/xattr.c | 39
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 2
-rw-r--r--  fs/fscache/object-list.c | 2
-rw-r--r--  fs/fuse/dev.c | 1
-rw-r--r--  fs/isofs/dir.c | 1
-rw-r--r--  fs/jbd2/transaction.c | 5
-rw-r--r--  fs/ncpfs/dir.c | 1
-rw-r--r--  fs/nfs/dir.c | 1
-rw-r--r--  fs/nfs/write.c | 20
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 16
-rw-r--r--  fs/proc/generic.c | 15
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/qnx4/dir.c | 1
-rw-r--r--  fs/quota/dquot.c | 11
-rw-r--r--  fs/read_write.c | 17
-rw-r--r--  fs/reiserfs/dir.c | 1
-rw-r--r--  fs/smbfs/dir.c | 1
-rw-r--r--  fs/squashfs/Kconfig | 11
-rw-r--r--  fs/squashfs/Makefile | 2
-rw-r--r--  fs/squashfs/inode.c | 92
-rw-r--r--  fs/squashfs/namei.c | 6
-rw-r--r--  fs/squashfs/squashfs.h | 12
-rw-r--r--  fs/squashfs/squashfs_fs.h | 76
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 3
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 3
-rw-r--r--  fs/squashfs/super.c | 30
-rw-r--r--  fs/squashfs/symlink.c | 11
-rw-r--r--  fs/squashfs/xattr.c | 323
-rw-r--r--  fs/squashfs/xattr.h | 46
-rw-r--r--  fs/squashfs/xattr_id.c | 100
-rw-r--r--  fs/udf/dir.c | 1
-rw-r--r--  fs/ufs/super.c | 2
-rw-r--r--  fs/ufs/ufs_fs.h | 1
88 files changed, 7080 insertions, 3741 deletions
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
224 affs_brelse(bh); 224 affs_brelse(bh);
225 inode = affs_iget(sb, ino); 225 inode = affs_iget(sb, ino);
226 if (IS_ERR(inode)) 226 if (IS_ERR(inode))
227 return ERR_PTR(PTR_ERR(inode)); 227 return ERR_CAST(inode);
228 } 228 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations; 229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 230 d_add(dentry, inode);
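[Note: the affs hunk above swaps ERR_PTR(PTR_ERR(inode)) for ERR_CAST(inode). Both produce the same encoded error pointer; ERR_CAST() is simply the <linux/err.h> helper for re-typing an error pointer without decoding it. A minimal sketch of the pattern, using a hypothetical lookup tail that is not part of this patch:]

    #include <linux/err.h>
    #include <linux/fs.h>

    /* 'inode' may be a real inode or an ERR_PTR() returned by an
     * iget-style helper.  ERR_CAST() propagates the error pointer in
     * place of the older ERR_PTR(PTR_ERR(inode)) round trip. */
    static struct dentry *example_lookup_tail(struct dentry *dentry,
                                              struct inode *inode)
    {
            if (IS_ERR(inode))
                    return ERR_CAST(inode);

            d_add(dentry, inode);
            return NULL;
    }
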
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..48fdeebdb544 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1385 return ret;
1385} 1386}
1386 1387
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1388static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1389{
1389 ssize_t ret; 1390 ssize_t ret;
1390 1391
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1392#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1393 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1394 ret = compat_rw_copy_check_uvector(type,
1395 (struct compat_iovec __user *)kiocb->ki_buf,
1396 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1397 &kiocb->ki_iovec);
1398 else
1399#endif
1400 ret = rw_copy_check_uvector(type,
1401 (struct iovec __user *)kiocb->ki_buf,
1402 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1403 &kiocb->ki_iovec);
1394 if (ret < 0) 1404 if (ret < 0)
1395 goto out; 1405 goto out;
1396 1406
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1430 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1431 * setup for the kiocb at the time of io submission.
1422 */ 1432 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1433static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1434{
1425 struct file *file = kiocb->ki_filp; 1435 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1436 ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1479 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1480 if (unlikely(ret))
1471 break; 1481 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1482 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1483 if (ret)
1474 break; 1484 break;
1475 ret = -EINVAL; 1485 ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1493 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1494 if (unlikely(ret))
1485 break; 1495 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1496 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1497 if (ret)
1488 break; 1498 break;
1489 ret = -EINVAL; 1499 ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1558}
1549 1559
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1560static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1561 struct iocb *iocb, struct hlist_head *batch_hash,
1562 bool compat)
1552{ 1563{
1553 struct kiocb *req; 1564 struct kiocb *req;
1554 struct file *file; 1565 struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1620 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1621 req->ki_opcode = iocb->aio_lio_opcode;
1611 1622
1612 ret = aio_setup_iocb(req); 1623 ret = aio_setup_iocb(req, compat);
1613 1624
1614 if (ret) 1625 if (ret)
1615 goto out_put_req; 1626 goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
1637 return ret; 1648 return ret;
1638} 1649}
1639 1650
1640/* sys_io_submit: 1651long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1652 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1653{
1655 struct kioctx *ctx; 1654 struct kioctx *ctx;
1656 long ret = 0; 1655 long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1686 break;
1688 } 1687 }
1689 1688
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1689 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1690 if (ret)
1692 break; 1691 break;
1693 } 1692 }
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1696 return i ? i : ret;
1698} 1697}
1699 1698
1699/* sys_io_submit:
1700 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1701 * the number of iocbs queued. May return -EINVAL if the aio_context
1702 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1703 * *iocbpp[0] is not properly initialized, if the operation specified
1704 * is invalid for the file descriptor in the iocb. May fail with
1705 * -EFAULT if any of the data structures point to invalid data. May
1706 * fail with -EBADF if the file descriptor specified in the first
1707 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1708 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1709 * fail with -ENOSYS if not implemented.
1710 */
1711SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1712 struct iocb __user * __user *, iocbpp)
1713{
1714 return do_io_submit(ctx_id, nr, iocbpp, 0);
1715}
1716
1700/* lookup_kiocb 1717/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1718 * Finds a given iocb for cancellation.
1702 */ 1719 */
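[Note: the aio.c hunks thread a `compat` flag from io_submit_one() down to the iovec checking, and split the old sys_io_submit() body into do_io_submit() so a 32-bit compat entry point can reuse the submission path. The matching fs/compat.c hunk is not shown in this section; the sketch below is a guess at what such a wrapper could look like (the function name and the pointer-widening loop are assumptions, not taken from this diff):]

    #include <linux/aio_abi.h>
    #include <linux/compat.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    extern long do_io_submit(aio_context_t ctx_id, long nr,
                             struct iocb __user * __user *iocbpp, bool compat);

    /* Sketch of a compat entry point: widen the 32-bit iocb pointers into a
     * native pointer array, then hand off to the common path with
     * compat == true so the vectored read/write setup takes the
     * compat_rw_copy_check_uvector() branch.  A real implementation would
     * also clamp nr to a sane batch size before the allocation. */
    asmlinkage long example_compat_io_submit(aio_context_t ctx_id, int nr,
                                             u32 __user *iocb32)
    {
            struct iocb __user * __user *iocb64;
            long i;

            if (nr < 0)
                    return -EINVAL;

            iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
            for (i = 0; i < nr; i++) {
                    compat_uptr_t uptr;

                    if (get_user(uptr, iocb32 + i) ||
                        put_user(compat_ptr(uptr), iocb64 + i))
                            return -EFAULT;
            }
            return do_io_submit(ctx_id, nr, iocb64, 1);
    }
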
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
29 29
30const struct file_operations autofs_root_operations = { 30const struct file_operations autofs_root_operations = {
31 .llseek = generic_file_llseek,
31 .read = generic_read_dir, 32 .read = generic_read_dir,
32 .readdir = autofs_root_readdir, 33 .readdir = autofs_root_readdir,
33 .ioctl = autofs_root_ioctl, 34 .ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
95 */ 95 */
96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
97{ 97{
98 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
99 99
100 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
101 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
107 if (!ads)
108 return ERR_PTR(-ENOMEM);
109
110 if (copy_from_user(ads, in, tmp.size)) {
111 kfree(ads);
112 return ERR_PTR(-EFAULT);
113 }
114
115 return ads;
116} 107}
117 108
118static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
736}; 727};
737 728
738static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
739 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
740 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
741 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
742}; 733};
743 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
744/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
745int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
746{ 740{
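[Note: the copy_dev_ioctl() change above is the standard memdup_user() conversion: the open-coded kmalloc() + copy_from_user() + error-unwind sequence collapses into one call that returns either the kernel copy or an ERR_PTR(). A minimal sketch of the same pattern for a hypothetical variable-sized ioctl argument:]

    #include <linux/err.h>
    #include <linux/string.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct example_args {
            u32 size;               /* total size, including this header */
            char path[];
    };

    /* Read the fixed header first to learn the total size, then copy the
     * whole block in one step.  memdup_user() returns the kmalloc'd copy
     * or an ERR_PTR() on failure, so no manual kfree() unwind is needed. */
    static struct example_args *copy_example_args(struct example_args __user *in)
    {
            struct example_args hdr;

            if (copy_from_user(&hdr, in, sizeof(hdr)))
                    return ERR_PTR(-EFAULT);
            if (hdr.size < sizeof(hdr))
                    return ERR_PTR(-EINVAL);

            return memdup_user(in, hdr.size);
    }
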
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
377 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
378 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
379 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
380 goto again; 381 goto again;
381 } 382 }
382 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
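[Note: two small btrfs_inode changes here: outstanding_extents becomes an atomic_t, which allows the per-inode extent count to be updated without taking accounting_lock, and a new orphan_meta_reserved bit is added. A rough illustration of what the atomic_t switch buys, with hypothetical call sites that are not part of this patch:]

    #include <linux/atomic.h>
    #include <linux/spinlock.h>

    struct example_inode {
            spinlock_t accounting_lock;
            atomic_t outstanding_extents;   /* was: int, guarded by the lock */
            int reserved_extents;
    };

    /* With an atomic_t the hot-path increment no longer needs the spinlock... */
    static void example_add_extent(struct example_inode *ei)
    {
            atomic_inc(&ei->outstanding_extents);
    }

    /* ...while fields that still need combined updates keep using the lock. */
    static void example_reserve(struct example_inode *ei, int nr)
    {
            spin_lock(&ei->accounting_lock);
            ei->reserved_extents += nr;
            spin_unlock(&ei->accounting_lock);
    }
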
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
281 struct btrfs_root *root, 281 struct btrfs_root *root,
282 struct extent_buffer *buf, 282 struct extent_buffer *buf,
283 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
284{ 285{
285 u64 refs; 286 u64 refs;
286 u64 owner; 287 u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
366 BUG_ON(ret); 367 BUG_ON(ret);
367 } 368 }
368 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
369 } 371 }
370 return 0; 372 return 0;
371} 373}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
392 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
393 struct extent_buffer *cow; 395 struct extent_buffer *cow;
394 int level; 396 int level;
397 int last_ref = 0;
395 int unlock_orig = 0; 398 int unlock_orig = 0;
396 u64 parent_start; 399 u64 parent_start;
397 400
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
442 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
443 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
444 447
445 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
446 452
447 if (buf == root->node) { 453 if (buf == root->node) {
448 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
457 extent_buffer_get(cow); 463 extent_buffer_get(cow);
458 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
459 465
460 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
461 parent_start, root->root_key.objectid, level); 467 last_ref);
462 free_extent_buffer(buf); 468 free_extent_buffer(buf);
463 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
464 } else { 470 } else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
473 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
474 trans->transid); 480 trans->transid);
475 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
476 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
477 parent_start, root->root_key.objectid, level); 483 last_ref);
478 } 484 }
479 if (unlock_orig) 485 if (unlock_orig)
480 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
949 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
950} 956}
951 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
952/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
953 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
954 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1019 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1020 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1021 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1022 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1023 1049
1024 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1025 root->node = child; 1051 root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1034 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1035 /* once for the path */ 1061 /* once for the path */
1036 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1037 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1038 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1039 /* once for the root ptr */ 1066 /* once for the root ptr */
1040 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1041 return ret; 1068 return 0;
1042 } 1069 }
1043 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1044 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1088 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1089 ret = wret; 1116 ret = wret;
1090 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1091 u64 bytenr = right->start;
1092 u32 blocksize = right->len;
1093
1094 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1095 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1096 free_extent_buffer(right);
1097 right = NULL;
1098 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1099 1); 1121 1);
1100 if (wret) 1122 if (wret)
1101 ret = wret; 1123 ret = wret;
1102 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1103 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1104 root->root_key.objectid, 1126 free_extent_buffer(right);
1105 level); 1127 right = NULL;
1106 if (wret)
1107 ret = wret;
1108 } else { 1128 } else {
1109 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1110 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1136 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1137 } 1157 }
1138 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1139 /* we've managed to empty the middle node, drop it */
1140 u64 bytenr = mid->start;
1141 u32 blocksize = mid->len;
1142
1143 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1144 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1145 free_extent_buffer(mid);
1146 mid = NULL;
1147 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1148 if (wret) 1162 if (wret)
1149 ret = wret; 1163 ret = wret;
1150 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1151 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1152 if (wret) 1166 free_extent_buffer(mid);
1153 ret = wret; 1167 mid = NULL;
1154 } else { 1168 } else {
1155 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1156 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1590 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1591 1605
1592 ret = -EAGAIN; 1606 ret = -EAGAIN;
1593 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1594 if (tmp) { 1608 if (tmp) {
1595 /* 1609 /*
1596 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
1740 p->nodes[level + 1], 1754 p->nodes[level + 1],
1741 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1742 if (err) { 1756 if (err) {
1743 free_extent_buffer(b);
1744 ret = err; 1757 ret = err;
1745 goto done; 1758 goto done;
1746 } 1759 }
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2076 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2077 return PTR_ERR(c); 2090 return PTR_ERR(c);
2078 2091
2092 root_add_used(root, root->nodesize);
2093
2079 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2080 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2081 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2134 int nritems; 2149 int nritems;
2135 2150
2136 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2137 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2138 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2139 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2202 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2203 return PTR_ERR(split); 2219 return PTR_ERR(split);
2204 2220
2221 root_add_used(root, root->nodesize);
2222
2205 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2206 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2207 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2415 2433
2416 if (left_nritems) 2434 if (left_nritems)
2417 btrfs_mark_buffer_dirty(left); 2435 btrfs_mark_buffer_dirty(left);
2436 else
2437 clean_tree_block(trans, root, left);
2438
2418 btrfs_mark_buffer_dirty(right); 2439 btrfs_mark_buffer_dirty(right);
2419 2440
2420 btrfs_item_key(right, &disk_key, 0); 2441 btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2660 btrfs_mark_buffer_dirty(left); 2681 btrfs_mark_buffer_dirty(left);
2661 if (right_nritems) 2682 if (right_nritems)
2662 btrfs_mark_buffer_dirty(right); 2683 btrfs_mark_buffer_dirty(right);
2684 else
2685 clean_tree_block(trans, root, right);
2663 2686
2664 btrfs_item_key(right, &disk_key, 0); 2687 btrfs_item_key(right, &disk_key, 0);
2665 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2688 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2669 /* then fixup the leaf pointer in the path */ 2692 /* then fixup the leaf pointer in the path */
2670 if (path->slots[0] < push_items) { 2693 if (path->slots[0] < push_items) {
2671 path->slots[0] += old_left_nritems; 2694 path->slots[0] += old_left_nritems;
2672 if (btrfs_header_nritems(path->nodes[0]) == 0)
2673 clean_tree_block(trans, root, path->nodes[0]);
2674 btrfs_tree_unlock(path->nodes[0]); 2695 btrfs_tree_unlock(path->nodes[0]);
2675 free_extent_buffer(path->nodes[0]); 2696 free_extent_buffer(path->nodes[0]);
2676 path->nodes[0] = left; 2697 path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
2932 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2953 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2933 root->root_key.objectid, 2954 root->root_key.objectid,
2934 &disk_key, 0, l->start, 0); 2955 &disk_key, 0, l->start, 0);
2935 if (IS_ERR(right)) { 2956 if (IS_ERR(right))
2936 BUG_ON(1);
2937 return PTR_ERR(right); 2957 return PTR_ERR(right);
2938 } 2958
2959 root_add_used(root, root->leafsize);
2939 2960
2940 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2961 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2941 btrfs_set_header_bytenr(right, right->start); 2962 btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3054 3075
3055 btrfs_set_path_blocking(path); 3076 btrfs_set_path_blocking(path);
3056 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3077 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3057 BUG_ON(ret); 3078 if (ret)
3079 goto err;
3058 3080
3059 path->keep_locks = 0; 3081 path->keep_locks = 0;
3060 btrfs_unlock_up_safe(path, 1); 3082 btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3796 */ 3818 */
3797 btrfs_unlock_up_safe(path, 0); 3819 btrfs_unlock_up_safe(path, 0);
3798 3820
3799 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3821 root_sub_used(root, leaf->len);
3800 0, root->root_key.objectid, 0); 3822
3801 return ret; 3823 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3824 return 0;
3802} 3825}
3803/* 3826/*
3804 * delete the item at the leaf level in path. If that empties 3827 * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3865 if (leaf == root->node) { 3888 if (leaf == root->node) {
3866 btrfs_set_header_level(leaf, 0); 3889 btrfs_set_header_level(leaf, 0);
3867 } else { 3890 } else {
3891 btrfs_set_path_blocking(path);
3892 clean_tree_block(trans, root, leaf);
3868 ret = btrfs_del_leaf(trans, root, path, leaf); 3893 ret = btrfs_del_leaf(trans, root, path, leaf);
3869 BUG_ON(ret); 3894 BUG_ON(ret);
3870 } 3895 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..e9bf86415e86 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34 34
35struct btrfs_trans_handle; 35struct btrfs_trans_handle;
36struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
37extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
38extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
39extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
663#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
664#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
665#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
666 668
667struct btrfs_block_group_item { 669struct btrfs_block_group_item {
668 __le64 used; 670 __le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
674 u64 flags; 676 u64 flags;
675 677
676 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
677 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
680 this does't take mirrors into account */
678 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
679 transaction finishes */ 682 transaction finishes */
680 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
681 current allocations */ 684 current allocations */
682 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
683 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
684 u64 bytes_root; /* the number of bytes needed to commit a
685 transaction */
686 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
687 delalloc/allocations */ 688 delalloc/allocations */
688 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
689 delayed allocation */
690 690
691 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
692 chunks for this space */ 692 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
694 this space */ 694 this space */
695 int force_delalloc; /* make people start doing filemap_flush until
696 we're under a threshold */
697 695
698 struct list_head list; 696 struct list_head list;
699 697
700 /* for controlling how we free up space for allocations */
701 wait_queue_head_t allocate_wait;
702 wait_queue_head_t flush_wait;
703 int allocating_chunk;
704 int flushing;
705
706 /* for block groups in our same type */ 698 /* for block groups in our same type */
707 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
708 spinlock_t lock; 700 spinlock_t lock;
709 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
710 atomic_t caching_threads; 702 atomic_t caching_threads;
711}; 703};
712 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
713/* 719/*
714 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
715 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
760 spinlock_t lock; 766 spinlock_t lock;
761 u64 pinned; 767 u64 pinned;
762 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
763 u64 bytes_super; 770 u64 bytes_super;
764 u64 flags; 771 u64 flags;
765 u64 sectorsize; 772 u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
825 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
826 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
827 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
828 u64 generation; 851 u64 generation;
829 u64 last_trans_committed; 852 u64 last_trans_committed;
830 853
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2349int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2350int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2351void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2352int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2353int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2354void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2355void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2356extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2357 2430
2358/* ioctl.c */ 2431/* ioctl.c */
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root); 2482 struct btrfs_root *root);
2410int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2411int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2412#endif 2493#endif
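[Note: the ctree.h hunks introduce the btrfs_block_rsv reservation framework: per-purpose reservations hung off btrfs_fs_info plus alloc/add/migrate/release helpers. Going only by the prototypes added above, a caller would presumably use it along the lines of the sketch below; this is a hedged illustration, not code from the patch, and the error handling is deliberately minimal:]

    #include "ctree.h"      /* btrfs-internal header declaring the helpers */

    /* Reserve metadata space from a private block reservation before a
     * multi-item operation, and give it back afterwards. */
    static int example_reserved_op(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root, u64 num_bytes)
    {
            struct btrfs_block_rsv *rsv;
            int retries = 0;
            int ret;

            rsv = btrfs_alloc_block_rsv(root);
            if (!rsv)
                    return -ENOMEM;

            ret = btrfs_block_rsv_add(trans, root, rsv, num_bytes, &retries);
            if (ret)
                    goto out;

            /* ... metadata modifications covered by the reservation ... */

            btrfs_block_rsv_release(root, rsv, num_bytes);
    out:
            btrfs_free_block_rsv(root, rsv);
            return ret;
    }
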
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
319} 319}
320 320
321/* 321/*
322 * helper function to lookup reference count and flags of extent.
323 *
324 * the head node for delayed ref is used to store the sum of all the
325 * reference count modifications queued up in the rbtree. the head
326 * node may also store the extent flags to set. This way you can check
327 * to see what the reference count and extent flags would be if all of
328 * the delayed refs are not processed.
329 */
330int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
331 struct btrfs_root *root, u64 bytenr,
332 u64 num_bytes, u64 *refs, u64 *flags)
333{
334 struct btrfs_delayed_ref_node *ref;
335 struct btrfs_delayed_ref_head *head;
336 struct btrfs_delayed_ref_root *delayed_refs;
337 struct btrfs_path *path;
338 struct btrfs_extent_item *ei;
339 struct extent_buffer *leaf;
340 struct btrfs_key key;
341 u32 item_size;
342 u64 num_refs;
343 u64 extent_flags;
344 int ret;
345
346 path = btrfs_alloc_path();
347 if (!path)
348 return -ENOMEM;
349
350 key.objectid = bytenr;
351 key.type = BTRFS_EXTENT_ITEM_KEY;
352 key.offset = num_bytes;
353 delayed_refs = &trans->transaction->delayed_refs;
354again:
355 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
356 &key, path, 0, 0);
357 if (ret < 0)
358 goto out;
359
360 if (ret == 0) {
361 leaf = path->nodes[0];
362 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
363 if (item_size >= sizeof(*ei)) {
364 ei = btrfs_item_ptr(leaf, path->slots[0],
365 struct btrfs_extent_item);
366 num_refs = btrfs_extent_refs(leaf, ei);
367 extent_flags = btrfs_extent_flags(leaf, ei);
368 } else {
369#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
370 struct btrfs_extent_item_v0 *ei0;
371 BUG_ON(item_size != sizeof(*ei0));
372 ei0 = btrfs_item_ptr(leaf, path->slots[0],
373 struct btrfs_extent_item_v0);
374 num_refs = btrfs_extent_refs_v0(leaf, ei0);
375 /* FIXME: this isn't correct for data */
376 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
377#else
378 BUG();
379#endif
380 }
381 BUG_ON(num_refs == 0);
382 } else {
383 num_refs = 0;
384 extent_flags = 0;
385 ret = 0;
386 }
387
388 spin_lock(&delayed_refs->lock);
389 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
390 if (ref) {
391 head = btrfs_delayed_node_to_head(ref);
392 if (!mutex_trylock(&head->mutex)) {
393 atomic_inc(&ref->refs);
394 spin_unlock(&delayed_refs->lock);
395
396 btrfs_release_path(root->fs_info->extent_root, path);
397
398 mutex_lock(&head->mutex);
399 mutex_unlock(&head->mutex);
400 btrfs_put_delayed_ref(ref);
401 goto again;
402 }
403 if (head->extent_op && head->extent_op->update_flags)
404 extent_flags |= head->extent_op->flags_to_set;
405 else
406 BUG_ON(num_refs == 0);
407
408 num_refs += ref->ref_mod;
409 mutex_unlock(&head->mutex);
410 }
411 WARN_ON(num_refs == 0);
412 if (refs)
413 *refs = num_refs;
414 if (flags)
415 *flags = extent_flags;
416out:
417 spin_unlock(&delayed_refs->lock);
418 btrfs_free_path(path);
419 return ret;
420}
421
422/*
423 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
424 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
425 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..f3b287c22caf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
74 int rw; 74 int rw;
75 int mirror_num; 75 int mirror_num;
76 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
77 struct btrfs_work work; 82 struct btrfs_work work;
78}; 83};
79 84
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
534 async = container_of(work, struct async_submit_bio, work); 539 async = container_of(work, struct async_submit_bio, work);
535 fs_info = BTRFS_I(async->inode)->root->fs_info; 540 fs_info = BTRFS_I(async->inode)->root->fs_info;
536 async->submit_bio_start(async->inode, async->rw, async->bio, 541 async->submit_bio_start(async->inode, async->rw, async->bio,
537 async->mirror_num, async->bio_flags); 542 async->mirror_num, async->bio_flags,
543 async->bio_offset);
538} 544}
539 545
540static void run_one_async_done(struct btrfs_work *work) 546static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
556 wake_up(&fs_info->async_submit_wait); 562 wake_up(&fs_info->async_submit_wait);
557 563
558 async->submit_bio_done(async->inode, async->rw, async->bio, 564 async->submit_bio_done(async->inode, async->rw, async->bio,
559 async->mirror_num, async->bio_flags); 565 async->mirror_num, async->bio_flags,
566 async->bio_offset);
560} 567}
561 568
562static void run_one_async_free(struct btrfs_work *work) 569static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
570int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 577int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
571 int rw, struct bio *bio, int mirror_num, 578 int rw, struct bio *bio, int mirror_num,
572 unsigned long bio_flags, 579 unsigned long bio_flags,
580 u64 bio_offset,
573 extent_submit_bio_hook_t *submit_bio_start, 581 extent_submit_bio_hook_t *submit_bio_start,
574 extent_submit_bio_hook_t *submit_bio_done) 582 extent_submit_bio_hook_t *submit_bio_done)
575{ 583{
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
592 600
593 async->work.flags = 0; 601 async->work.flags = 0;
594 async->bio_flags = bio_flags; 602 async->bio_flags = bio_flags;
603 async->bio_offset = bio_offset;
595 604
596 atomic_inc(&fs_info->nr_async_submits); 605 atomic_inc(&fs_info->nr_async_submits);
597 606
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
627 636
628static int __btree_submit_bio_start(struct inode *inode, int rw, 637static int __btree_submit_bio_start(struct inode *inode, int rw,
629 struct bio *bio, int mirror_num, 638 struct bio *bio, int mirror_num,
630 unsigned long bio_flags) 639 unsigned long bio_flags,
640 u64 bio_offset)
631{ 641{
632 /* 642 /*
633 * when we're called for a write, we're already in the async 643 * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
638} 648}
639 649
640static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 650static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
641 int mirror_num, unsigned long bio_flags) 651 int mirror_num, unsigned long bio_flags,
652 u64 bio_offset)
642{ 653{
643 /* 654 /*
644 * when we're called for a write, we're already in the async 655 * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
648} 659}
649 660
650static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 661static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
651 int mirror_num, unsigned long bio_flags) 662 int mirror_num, unsigned long bio_flags,
663 u64 bio_offset)
652{ 664{
653 int ret; 665 int ret;
654 666
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
671 */ 683 */
672 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 684 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
673 inode, rw, bio, mirror_num, 0, 685 inode, rw, bio, mirror_num, 0,
686 bio_offset,
674 __btree_submit_bio_start, 687 __btree_submit_bio_start,
675 __btree_submit_bio_done); 688 __btree_submit_bio_done);
676} 689}
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
894 root->ref_cows = 0; 907 root->ref_cows = 0;
895 root->track_dirty = 0; 908 root->track_dirty = 0;
896 root->in_radix = 0; 909 root->in_radix = 0;
897 root->clean_orphans = 0; 910 root->orphan_item_inserted = 0;
911 root->orphan_cleanup_state = 0;
898 912
899 root->fs_info = fs_info; 913 root->fs_info = fs_info;
900 root->objectid = objectid; 914 root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
903 root->name = NULL; 917 root->name = NULL;
904 root->in_sysfs = 0; 918 root->in_sysfs = 0;
905 root->inode_tree = RB_ROOT; 919 root->inode_tree = RB_ROOT;
920 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL;
906 922
907 INIT_LIST_HEAD(&root->dirty_list); 923 INIT_LIST_HEAD(&root->dirty_list);
908 INIT_LIST_HEAD(&root->orphan_list); 924 INIT_LIST_HEAD(&root->orphan_list);
909 INIT_LIST_HEAD(&root->root_list); 925 INIT_LIST_HEAD(&root->root_list);
910 spin_lock_init(&root->node_lock); 926 spin_lock_init(&root->node_lock);
911 spin_lock_init(&root->list_lock); 927 spin_lock_init(&root->orphan_lock);
912 spin_lock_init(&root->inode_lock); 928 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock);
913 mutex_init(&root->objectid_mutex); 930 mutex_init(&root->objectid_mutex);
914 mutex_init(&root->log_mutex); 931 mutex_init(&root->log_mutex);
915 init_waitqueue_head(&root->log_writer_wait); 932 init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
968 return 0; 985 return 0;
969} 986}
970 987
971int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
972 struct btrfs_fs_info *fs_info)
973{
974 struct extent_buffer *eb;
975 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
976 u64 start = 0;
977 u64 end = 0;
978 int ret;
979
980 if (!log_root_tree)
981 return 0;
982
983 while (1) {
984 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
985 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
986 if (ret)
987 break;
988
989 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
990 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
991 }
992 eb = fs_info->log_root_tree->node;
993
994 WARN_ON(btrfs_header_level(eb) != 0);
995 WARN_ON(btrfs_header_nritems(eb) != 0);
996
997 ret = btrfs_free_reserved_extent(fs_info->tree_root,
998 eb->start, eb->len);
999 BUG_ON(ret);
1000
1001 free_extent_buffer(eb);
1002 kfree(fs_info->log_root_tree);
1003 fs_info->log_root_tree = NULL;
1004 return 0;
1005}
1006
1007static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 988static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1008 struct btrfs_fs_info *fs_info) 989 struct btrfs_fs_info *fs_info)
1009{ 990{
@@ -1191,19 +1172,23 @@ again:
1191 if (root) 1172 if (root)
1192 return root; 1173 return root;
1193 1174
1194 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1195 if (ret == 0)
1196 ret = -ENOENT;
1197 if (ret < 0)
1198 return ERR_PTR(ret);
1199
1200 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1175 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1201 if (IS_ERR(root)) 1176 if (IS_ERR(root))
1202 return root; 1177 return root;
1203 1178
1204 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1205 set_anon_super(&root->anon_super, NULL); 1179 set_anon_super(&root->anon_super, NULL);
1206 1180
1181 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT;
1183 goto fail;
1184 }
1185
1186 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1187 if (ret < 0)
1188 goto fail;
1189 if (ret == 0)
1190 root->orphan_item_inserted = 1;
1191
1207 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1192 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1208 if (ret) 1193 if (ret)
1209 goto fail; 1194 goto fail;
@@ -1212,10 +1197,9 @@ again:
1212 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1197 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1213 (unsigned long)root->root_key.objectid, 1198 (unsigned long)root->root_key.objectid,
1214 root); 1199 root);
1215 if (ret == 0) { 1200 if (ret == 0)
1216 root->in_radix = 1; 1201 root->in_radix = 1;
1217 root->clean_orphans = 1; 1202
1218 }
1219 spin_unlock(&fs_info->fs_roots_radix_lock); 1203 spin_unlock(&fs_info->fs_roots_radix_lock);
1220 radix_tree_preload_end(); 1204 radix_tree_preload_end();
1221 if (ret) { 1205 if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
1461 struct btrfs_root *root = arg; 1445 struct btrfs_root *root = arg;
1462 1446
1463 do { 1447 do {
1464 smp_mb();
1465 if (root->fs_info->closing)
1466 break;
1467
1468 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1448 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1469 1449
1470 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1450 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
1477 if (freezing(current)) { 1457 if (freezing(current)) {
1478 refrigerator(); 1458 refrigerator();
1479 } else { 1459 } else {
1480 smp_mb();
1481 if (root->fs_info->closing)
1482 break;
1483 set_current_state(TASK_INTERRUPTIBLE); 1460 set_current_state(TASK_INTERRUPTIBLE);
1484 schedule(); 1461 if (!kthread_should_stop())
1462 schedule();
1485 __set_current_state(TASK_RUNNING); 1463 __set_current_state(TASK_RUNNING);
1486 } 1464 }
1487 } while (!kthread_should_stop()); 1465 } while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
1493 struct btrfs_root *root = arg; 1471 struct btrfs_root *root = arg;
1494 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1495 struct btrfs_transaction *cur; 1473 struct btrfs_transaction *cur;
1474 u64 transid;
1496 unsigned long now; 1475 unsigned long now;
1497 unsigned long delay; 1476 unsigned long delay;
1498 int ret; 1477 int ret;
1499 1478
1500 do { 1479 do {
1501 smp_mb();
1502 if (root->fs_info->closing)
1503 break;
1504
1505 delay = HZ * 30; 1480 delay = HZ * 30;
1506 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1507 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1482 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1508 1483
1509 mutex_lock(&root->fs_info->trans_mutex); 1484 spin_lock(&root->fs_info->new_trans_lock);
1510 cur = root->fs_info->running_transaction; 1485 cur = root->fs_info->running_transaction;
1511 if (!cur) { 1486 if (!cur) {
1512 mutex_unlock(&root->fs_info->trans_mutex); 1487 spin_unlock(&root->fs_info->new_trans_lock);
1513 goto sleep; 1488 goto sleep;
1514 } 1489 }
1515 1490
1516 now = get_seconds(); 1491 now = get_seconds();
1517 if (now < cur->start_time || now - cur->start_time < 30) { 1492 if (!cur->blocked &&
1518 mutex_unlock(&root->fs_info->trans_mutex); 1493 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock);
1519 delay = HZ * 5; 1495 delay = HZ * 5;
1520 goto sleep; 1496 goto sleep;
1521 } 1497 }
1522 mutex_unlock(&root->fs_info->trans_mutex); 1498 transid = cur->transid;
1523 trans = btrfs_start_transaction(root, 1); 1499 spin_unlock(&root->fs_info->new_trans_lock);
1524 ret = btrfs_commit_transaction(trans, root);
1525 1500
1501 trans = btrfs_join_transaction(root, 1);
1502 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret);
1505 } else {
1506 btrfs_end_transaction(trans, root);
1507 }
1526sleep: 1508sleep:
1527 wake_up_process(root->fs_info->cleaner_kthread); 1509 wake_up_process(root->fs_info->cleaner_kthread);
1528 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1510 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
1530 if (freezing(current)) { 1512 if (freezing(current)) {
1531 refrigerator(); 1513 refrigerator();
1532 } else { 1514 } else {
1533 if (root->fs_info->closing)
1534 break;
1535 set_current_state(TASK_INTERRUPTIBLE); 1515 set_current_state(TASK_INTERRUPTIBLE);
1536 schedule_timeout(delay); 1516 if (!kthread_should_stop() &&
1517 !btrfs_transaction_blocked(root->fs_info))
1518 schedule_timeout(delay);
1537 __set_current_state(TASK_RUNNING); 1519 __set_current_state(TASK_RUNNING);
1538 } 1520 }
1539 } while (!kthread_should_stop()); 1521 } while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1620 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1602 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1621 INIT_LIST_HEAD(&fs_info->space_info); 1603 INIT_LIST_HEAD(&fs_info->space_info);
1622 btrfs_mapping_init(&fs_info->mapping_tree); 1604 btrfs_mapping_init(&fs_info->mapping_tree);
1605 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1606 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1607 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1608 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1609 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1610 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
1611 mutex_init(&fs_info->durable_block_rsv_mutex);
1623 atomic_set(&fs_info->nr_async_submits, 0); 1612 atomic_set(&fs_info->nr_async_submits, 0);
1624 atomic_set(&fs_info->async_delalloc_pages, 0); 1613 atomic_set(&fs_info->async_delalloc_pages, 0);
1625 atomic_set(&fs_info->async_submit_draining, 0); 1614 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1759 min_t(u64, fs_devices->num_devices, 1748 min_t(u64, fs_devices->num_devices,
1760 fs_info->thread_pool_size), 1749 fs_info->thread_pool_size),
1761 &fs_info->generic_worker); 1750 &fs_info->generic_worker);
1762 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1763 fs_info->thread_pool_size,
1764 &fs_info->generic_worker);
1765 1751
1766 /* a higher idle thresh on the submit workers makes it much more 1752 /* a higher idle thresh on the submit workers makes it much more
1767 * likely that bios will be sent down in a sane order to the 1753
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1809 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1810 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1811 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1797 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1812 btrfs_start_workers(&fs_info->enospc_workers, 1);
1813 1798
1814 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1815 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1912 1897
1913 csum_root->track_dirty = 1; 1898 csum_root->track_dirty = 1;
1914 1899
1900 fs_info->generation = generation;
1901 fs_info->last_trans_committed = generation;
1902 fs_info->data_alloc_profile = (u64)-1;
1903 fs_info->metadata_alloc_profile = (u64)-1;
1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905
1915 ret = btrfs_read_block_groups(extent_root); 1906 ret = btrfs_read_block_groups(extent_root);
1916 if (ret) { 1907 if (ret) {
1917 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1918 goto fail_block_groups; 1909 goto fail_block_groups;
1919 } 1910 }
1920 1911
1921 fs_info->generation = generation;
1922 fs_info->last_trans_committed = generation;
1923 fs_info->data_alloc_profile = (u64)-1;
1924 fs_info->metadata_alloc_profile = (u64)-1;
1925 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1926 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1912 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1927 "btrfs-cleaner"); 1913 "btrfs-cleaner");
1928 if (IS_ERR(fs_info->cleaner_kthread)) 1914 if (IS_ERR(fs_info->cleaner_kthread))
@@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1977 BUG_ON(ret); 1963 BUG_ON(ret);
1978 1964
1979 if (!(sb->s_flags & MS_RDONLY)) { 1965 if (!(sb->s_flags & MS_RDONLY)) {
1966 ret = btrfs_cleanup_fs_roots(fs_info);
1967 BUG_ON(ret);
1968
1980 ret = btrfs_recover_relocation(tree_root); 1969 ret = btrfs_recover_relocation(tree_root);
1981 if (ret < 0) { 1970 if (ret < 0) {
1982 printk(KERN_WARNING 1971 printk(KERN_WARNING
@@ -2040,7 +2029,6 @@ fail_sb_buffer:
2040 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2029 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2041 btrfs_stop_workers(&fs_info->endio_write_workers); 2030 btrfs_stop_workers(&fs_info->endio_write_workers);
2042 btrfs_stop_workers(&fs_info->submit_workers); 2031 btrfs_stop_workers(&fs_info->submit_workers);
2043 btrfs_stop_workers(&fs_info->enospc_workers);
2044fail_iput: 2032fail_iput:
2045 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2033 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2046 iput(fs_info->btree_inode); 2034 iput(fs_info->btree_inode);
@@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
2405 down_write(&root->fs_info->cleanup_work_sem); 2393 down_write(&root->fs_info->cleanup_work_sem);
2406 up_write(&root->fs_info->cleanup_work_sem); 2394 up_write(&root->fs_info->cleanup_work_sem);
2407 2395
2408 trans = btrfs_start_transaction(root, 1); 2396 trans = btrfs_join_transaction(root, 1);
2409 ret = btrfs_commit_transaction(trans, root); 2397 ret = btrfs_commit_transaction(trans, root);
2410 BUG_ON(ret); 2398 BUG_ON(ret);
2411 /* run commit again to drop the original snapshot */ 2399 /* run commit again to drop the original snapshot */
2412 trans = btrfs_start_transaction(root, 1); 2400 trans = btrfs_join_transaction(root, 1);
2413 btrfs_commit_transaction(trans, root); 2401 btrfs_commit_transaction(trans, root);
2414 ret = btrfs_write_and_wait_transaction(NULL, root); 2402 ret = btrfs_write_and_wait_transaction(NULL, root);
2415 BUG_ON(ret); 2403 BUG_ON(ret);
@@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
2426 fs_info->closing = 1; 2414 fs_info->closing = 1;
2427 smp_mb(); 2415 smp_mb();
2428 2416
2429 kthread_stop(root->fs_info->transaction_kthread);
2430 kthread_stop(root->fs_info->cleaner_kthread);
2431
2432 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2417 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2433 ret = btrfs_commit_super(root); 2418 ret = btrfs_commit_super(root);
2434 if (ret) 2419 if (ret)
2435 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2420 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2436 } 2421 }
2437 2422
2423 kthread_stop(root->fs_info->transaction_kthread);
2424 kthread_stop(root->fs_info->cleaner_kthread);
2425
2438 fs_info->closing = 2; 2426 fs_info->closing = 2;
2439 smp_mb(); 2427 smp_mb();
2440 2428
@@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
2473 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2461 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2474 btrfs_stop_workers(&fs_info->endio_write_workers); 2462 btrfs_stop_workers(&fs_info->endio_write_workers);
2475 btrfs_stop_workers(&fs_info->submit_workers); 2463 btrfs_stop_workers(&fs_info->submit_workers);
2476 btrfs_stop_workers(&fs_info->enospc_workers);
2477 2464
2478 btrfs_close_devices(fs_info->fs_devices); 2465 btrfs_close_devices(fs_info->fs_devices);
2479 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2466 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 87 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
89 int rw, struct bio *bio, int mirror_num, 89 int rw, struct bio *bio, int mirror_num,
90 unsigned long bio_flags, 90 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 91 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 92 extent_submit_bio_hook_t *submit_bio_done);
93 93
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 96int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
98int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
99 struct btrfs_fs_info *fs_info);
100int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 98int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101 struct btrfs_fs_info *fs_info); 99 struct btrfs_fs_info *fs_info);
102int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 100int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..b9080d71991a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
35 35
36static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 37 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc);
39 int mark_free); 39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve, int sinfo);
41 u64 num_bytes, int reserve);
42static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
43 struct btrfs_root *root, 42 struct btrfs_root *root,
44 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
61static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
63 u64 flags, int force); 62 u64 flags, int force);
64static int pin_down_bytes(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 struct btrfs_path *path,
67 u64 bytenr, u64 num_bytes,
68 int is_data, int reserved,
69 struct extent_buffer **must_clean);
70static int find_next_key(struct btrfs_path *path, int level, 63static int find_next_key(struct btrfs_path *path, int level,
71 struct btrfs_key *key); 64 struct btrfs_key *key);
72static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
91 84
92void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
93{ 86{
94 if (atomic_dec_and_test(&cache->count)) 87 if (atomic_dec_and_test(&cache->count)) {
88 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0);
95 kfree(cache); 91 kfree(cache);
92 }
96} 93}
97 94
98/* 95/*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
319 316
320 exclude_super_stripes(extent_root, block_group); 317 exclude_super_stripes(extent_root, block_group);
321 spin_lock(&block_group->space_info->lock); 318 spin_lock(&block_group->space_info->lock);
322 block_group->space_info->bytes_super += block_group->bytes_super; 319 block_group->space_info->bytes_readonly += block_group->bytes_super;
323 spin_unlock(&block_group->space_info->lock); 320 spin_unlock(&block_group->space_info->lock);
324 321
325 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
507 struct list_head *head = &info->space_info; 504 struct list_head *head = &info->space_info;
508 struct btrfs_space_info *found; 505 struct btrfs_space_info *found;
509 506
507 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
508 BTRFS_BLOCK_GROUP_METADATA;
509
510 rcu_read_lock(); 510 rcu_read_lock();
511 list_for_each_entry_rcu(found, head, list) { 511 list_for_each_entry_rcu(found, head, list) {
512 if (found->flags == flags) { 512 if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
610} 610}
611 611
612/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
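The helper added above folds any pending delayed-ref modifications into the returned reference count and flags. A minimal caller sketch (illustrative only, not part of this diff; "eb" stands for an arbitrary extent_buffer and error handling is elided):

	u64 refs = 0;
	u64 flags = 0;
	int err;

	/*
	 * hypothetical caller: is this tree block shared, and does it
	 * still carry normal back refs rather than a full backref?
	 */
	err = btrfs_lookup_extent_info(trans, root, eb->start, eb->len,
				       &refs, &flags);
	if (!err && refs > 1 && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
		/* shared block using normal back references */
	}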
719/*
613 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
614 * 721 *
615 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1871 return ret; 1978 return ret;
1872} 1979}
1873 1980
1874
1875/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1876static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1877 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1891 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1892 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1893 if (insert_reserved) { 1999 if (insert_reserved) {
1894 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1895 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1896
1897 ret = pin_down_bytes(trans, root, NULL,
1898 node->bytenr, node->num_bytes,
1899 head->is_data, 1, &must_clean);
1900 if (ret > 0)
1901 mark_free = 1;
1902
1903 if (must_clean) {
1904 clean_tree_block(NULL, root, must_clean);
1905 btrfs_tree_unlock(must_clean);
1906 free_extent_buffer(must_clean);
1907 }
1908 if (head->is_data) { 2002 if (head->is_data) {
1909 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1910 node->bytenr, 2004 node->bytenr,
1911 node->num_bytes); 2005 node->num_bytes);
1912 BUG_ON(ret); 2006 BUG_ON(ret);
1913 } 2007 }
1914 if (mark_free) {
1915 ret = btrfs_free_reserved_extent(root,
1916 node->bytenr,
1917 node->num_bytes);
1918 BUG_ON(ret);
1919 }
1920 } 2008 }
1921 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1922 return 0; 2010 return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2347 ret = 0; 2435 ret = 0;
2348out: 2436out:
2349 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2350 return ret; 2440 return ret;
2351} 2441}
2352 2442
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2660 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2661{ 2751{
2662 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2663 2761
2664 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2665 if (found) { 2763 if (found) {
2666 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2667 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2668 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2669 found->full = 0; 2768 found->full = 0;
2670 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2671 *space_info = found; 2770 *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2675 if (!found) 2774 if (!found)
2676 return -ENOMEM; 2775 return -ENOMEM;
2677 2776
2678 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2679 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2680 init_waitqueue_head(&found->flush_wait);
2681 init_waitqueue_head(&found->allocate_wait);
2682 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2683 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2684 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2685 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2686 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2687 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2688 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2689 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2690 found->full = 0; 2791 found->full = 0;
2691 found->force_alloc = 0; 2792 found->force_alloc = 0;
2692 *space_info = found; 2793 *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2711 } 2812 }
2712} 2813}
2713 2814
2714static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2715{
2716 spin_lock(&cache->space_info->lock);
2717 spin_lock(&cache->lock);
2718 if (!cache->ro) {
2719 cache->space_info->bytes_readonly += cache->key.offset -
2720 btrfs_block_group_used(&cache->item);
2721 cache->ro = 1;
2722 }
2723 spin_unlock(&cache->lock);
2724 spin_unlock(&cache->space_info->lock);
2725}
2726
2727u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2728{ 2816{
2729 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2752 return flags; 2840 return flags;
2753} 2841}
2754 2842
2755static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2756{
2757 struct btrfs_fs_info *info = root->fs_info;
2758 u64 alloc_profile;
2759
2760 if (data) {
2761 alloc_profile = info->avail_data_alloc_bits &
2762 info->data_alloc_profile;
2763 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2764 } else if (root == root->fs_info->chunk_root) {
2765 alloc_profile = info->avail_system_alloc_bits &
2766 info->system_alloc_profile;
2767 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2768 } else {
2769 alloc_profile = info->avail_metadata_alloc_bits &
2770 info->metadata_alloc_profile;
2771 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2772 }
2773
2774 return btrfs_reduce_alloc_profile(root, data);
2775}
2776
2777void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2778{
2779 u64 alloc_target;
2780
2781 alloc_target = btrfs_get_alloc_profile(root, 1);
2782 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2783 alloc_target);
2784}
2785
2786static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2787{
2788 u64 num_bytes;
2789 int level;
2790
2791 level = BTRFS_MAX_LEVEL - 2;
2792 /*
2793 * NOTE: these calculations are absolutely the worst possible case.
2794 * This assumes that _every_ item we insert will require a new leaf, and
2795 * that the tree has grown to its maximum level size.
2796 */
2797
2798 /*
2799 * for every item we insert we could insert both an extent item and an
2800 * extent ref item. Then for every item we insert, we will need to cow
2801 * both the original leaf, plus the leaf to the left and right of it.
2802 *
2803 * Unless we are talking about the extent root, then we just want the
2804 * number of items * 2, since we just need the extent item plus its ref.
2805 */
2806 if (root == root->fs_info->extent_root)
2807 num_bytes = num_items * 2;
2808 else
2809 num_bytes = (num_items + (2 * num_items)) * 3;
2810
2811 /*
2812 * num_bytes is total number of leaves we could need times the leaf
2813 * size, and then for every leaf we could end up cow'ing 2 nodes per
2814 * level, down to the leaf level.
2815 */
2816 num_bytes = (num_bytes * root->leafsize) +
2817 (num_bytes * (level * 2)) * root->nodesize;
2818
2819 return num_bytes;
2820}
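For reference, the removed calculate_bytes_needed() helper sized its worst case as follows (a worked example, not part of the diff, assuming the common 4 KiB leafsize/nodesize and a BTRFS_MAX_LEVEL of 8, so level = 6): for one item in a non-extent root it budgeted (1 + 2 * 1) * 3 = 9 leaves, which becomes 9 * 4096 + 9 * (6 * 2) * 4096 = 479232 bytes, roughly 468 KiB of metadata per item in the absolute worst case.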
2821
2822/*
2823 * Unreserve metadata space for delalloc. If we have less reserved credits than
2824 * we have extents, this function does nothing.
2825 */
2826int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2827 struct inode *inode, int num_items)
2828{
2829 struct btrfs_fs_info *info = root->fs_info;
2830 struct btrfs_space_info *meta_sinfo;
2831 u64 num_bytes;
2832 u64 alloc_target;
2833 bool bug = false;
2834
2835 /* get the space info for where the metadata will live */
2836 alloc_target = btrfs_get_alloc_profile(root, 0);
2837 meta_sinfo = __find_space_info(info, alloc_target);
2838
2839 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2840 num_items);
2841
2842 spin_lock(&meta_sinfo->lock);
2843 spin_lock(&BTRFS_I(inode)->accounting_lock);
2844 if (BTRFS_I(inode)->reserved_extents <=
2845 BTRFS_I(inode)->outstanding_extents) {
2846 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2847 spin_unlock(&meta_sinfo->lock);
2848 return 0;
2849 }
2850 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2851
2852 BTRFS_I(inode)->reserved_extents -= num_items;
2853 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2854
2855 if (meta_sinfo->bytes_delalloc < num_bytes) {
2856 bug = true;
2857 meta_sinfo->bytes_delalloc = 0;
2858 } else {
2859 meta_sinfo->bytes_delalloc -= num_bytes;
2860 }
2861 spin_unlock(&meta_sinfo->lock);
2862
2863 BUG_ON(bug);
2864
2865 return 0;
2866}
2867
2868static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2869{ 2844{
2870 u64 thresh; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2871 2846 flags |= root->fs_info->avail_data_alloc_bits &
2872 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2847 root->fs_info->data_alloc_profile;
2873 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2874 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2849 flags |= root->fs_info->avail_system_alloc_bits &
2875 meta_sinfo->bytes_may_use; 2850 root->fs_info->system_alloc_profile;
2876 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2877 thresh = meta_sinfo->total_bytes - thresh; 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2878 thresh *= 80; 2853 root->fs_info->metadata_alloc_profile;
2879 do_div(thresh, 100); 2854 return btrfs_reduce_alloc_profile(root, flags);
2880 if (thresh <= meta_sinfo->bytes_delalloc)
2881 meta_sinfo->force_delalloc = 1;
2882 else
2883 meta_sinfo->force_delalloc = 0;
2884} 2855}
2885 2856
2886struct async_flush { 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2887 struct btrfs_root *root;
2888 struct btrfs_space_info *info;
2889 struct btrfs_work work;
2890};
2891
2892static noinline void flush_delalloc_async(struct btrfs_work *work)
2893{ 2858{
2894 struct async_flush *async; 2859 u64 flags;
2895 struct btrfs_root *root;
2896 struct btrfs_space_info *info;
2897
2898 async = container_of(work, struct async_flush, work);
2899 root = async->root;
2900 info = async->info;
2901
2902 btrfs_start_delalloc_inodes(root, 0);
2903 wake_up(&info->flush_wait);
2904 btrfs_wait_ordered_extents(root, 0, 0);
2905
2906 spin_lock(&info->lock);
2907 info->flushing = 0;
2908 spin_unlock(&info->lock);
2909 wake_up(&info->flush_wait);
2910
2911 kfree(async);
2912}
2913
2914static void wait_on_flush(struct btrfs_space_info *info)
2915{
2916 DEFINE_WAIT(wait);
2917 u64 used;
2918
2919 while (1) {
2920 prepare_to_wait(&info->flush_wait, &wait,
2921 TASK_UNINTERRUPTIBLE);
2922 spin_lock(&info->lock);
2923 if (!info->flushing) {
2924 spin_unlock(&info->lock);
2925 break;
2926 }
2927
2928 used = info->bytes_used + info->bytes_reserved +
2929 info->bytes_pinned + info->bytes_readonly +
2930 info->bytes_super + info->bytes_root +
2931 info->bytes_may_use + info->bytes_delalloc;
2932 if (used < info->total_bytes) {
2933 spin_unlock(&info->lock);
2934 break;
2935 }
2936 spin_unlock(&info->lock);
2937 schedule();
2938 }
2939 finish_wait(&info->flush_wait, &wait);
2940}
2941
2942static void flush_delalloc(struct btrfs_root *root,
2943 struct btrfs_space_info *info)
2944{
2945 struct async_flush *async;
2946 bool wait = false;
2947
2948 spin_lock(&info->lock);
2949 2860
2950 if (!info->flushing) 2861 if (data)
2951 info->flushing = 1; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2863 else if (root == root->fs_info->chunk_root)
2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2952 else 2865 else
2953 wait = true; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2954
2955 spin_unlock(&info->lock);
2956
2957 if (wait) {
2958 wait_on_flush(info);
2959 return;
2960 }
2961
2962 async = kzalloc(sizeof(*async), GFP_NOFS);
2963 if (!async)
2964 goto flush;
2965
2966 async->root = root;
2967 async->info = info;
2968 async->work.func = flush_delalloc_async;
2969 2867
2970 btrfs_queue_worker(&root->fs_info->enospc_workers, 2868 return get_alloc_profile(root, flags);
2971 &async->work);
2972 wait_on_flush(info);
2973 return;
2974
2975flush:
2976 btrfs_start_delalloc_inodes(root, 0);
2977 btrfs_wait_ordered_extents(root, 0, 0);
2978
2979 spin_lock(&info->lock);
2980 info->flushing = 0;
2981 spin_unlock(&info->lock);
2982 wake_up(&info->flush_wait);
2983} 2869}
2984 2870
2985static int maybe_allocate_chunk(struct btrfs_root *root, 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2986 struct btrfs_space_info *info)
2987{
2988 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2989 struct btrfs_trans_handle *trans;
2990 bool wait = false;
2991 int ret = 0;
2992 u64 min_metadata;
2993 u64 free_space;
2994
2995 free_space = btrfs_super_total_bytes(disk_super);
2996 /*
2997 * we allow the metadata to grow to a max of either 10gb or 5% of the
2998 * space in the volume.
2999 */
3000 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3001 div64_u64(free_space * 5, 100));
3002 if (info->total_bytes >= min_metadata) {
3003 spin_unlock(&info->lock);
3004 return 0;
3005 }
3006
3007 if (info->full) {
3008 spin_unlock(&info->lock);
3009 return 0;
3010 }
3011
3012 if (!info->allocating_chunk) {
3013 info->force_alloc = 1;
3014 info->allocating_chunk = 1;
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113
3114 BTRFS_I(inode)->reserved_extents += num_items;
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120
3121 return 0;
3122}
3123
3124/*
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * operations which will result in more metadata being used, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worst case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. The only time we don't do this is if we're reserving space
3171 * inside of a transaction, in which case we will just return -ENOSPC and it is the
3172 * caller's responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
3238 u64 used; 2885 u64 used;
3239 int ret = 0, committed = 0, flushed = 0; 2886 int ret = 0, committed = 0;
3240 2887
3241 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
3242 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing a delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can cleanup the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
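Per the comments above, btrfs_check_data_free_space() and btrfs_free_reserved_data_space() now form a reserve/release pair keyed only on the inode. A hedged sketch of the intended pairing (illustrative only; do_write_here() is a placeholder for whatever actually dirties the pages):

	/* reserve data space before dirtying pages */
	ret = btrfs_check_data_free_space(inode, num_bytes);
	if (ret)
		return ret;

	ret = do_write_here(inode, pos, num_bytes);	/* placeholder */
	if (ret) {
		/* error after reserving: give the bytes back */
		btrfs_free_reserved_data_space(inode, num_bytes);
		return ret;
	}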
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing a delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
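should_retry_reserve() escalates through three recovery steps before giving up: grow the metadata pool with a new chunk, reclaim delalloc reservations, and finally commit the transaction. The sketch below condenses that ladder; next_step() and its arguments are invented names, and the real code interleaves extra checks (in-commit transactions, pinned byte counts, and -EAGAIN when called with a live handle).

#include <stdio.h>

enum action { ALLOC_CHUNK, SHRINK_DELALLOC, COMMIT_TRANSACTION, GIVE_UP };

/* Condensed view of the escalation order used above. */
static enum action next_step(int retries, int chunk_ok, int shrink_ok)
{
    if (retries > 2)
        return GIVE_UP;
    if (chunk_ok)
        return ALLOC_CHUNK;
    if (shrink_ok)
        return SHRINK_DELALLOC;
    return COMMIT_TRANSACTION;
}

int main(void)
{
    printf("%d\n", next_step(0, 1, 1));  /* first try: grow the pool    */
    printf("%d\n", next_step(1, 0, 1));  /* then: reclaim delalloc      */
    printf("%d\n", next_step(2, 0, 0));  /* then: commit and retry      */
    printf("%d\n", next_step(3, 0, 0));  /* finally: give up (-ENOSPC)  */
    return 0;
}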
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
3233
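For reservations with a priority below 10, reserve_metadata_bytes() admits a request only while (unused + reserved) * priority >= (num_bytes + reserved) * 10, so a fresh priority-6 rsv can take at most 60% of the space that is currently unused. A tiny worked check, with purely illustrative numbers:

#include <stdint.h>
#include <stdio.h>

/* The admission test used above for rsvs with priority < 10. */
static int may_reserve(uint64_t unused, uint64_t already_reserved,
                       uint64_t num_bytes, int priority)
{
    return (unused + already_reserved) * priority >=
           (num_bytes + already_reserved) * 10;
}

int main(void)
{
    uint64_t unused = 100ULL << 20;  /* 100 MiB free in the space_info */

    /* priority 6, empty rsv: anything up to 60 MiB passes ... */
    printf("%d\n", may_reserve(unused, 0, 60ULL << 20, 6)); /* prints 1 */
    /* ... but 61 MiB does not. */
    printf("%d\n", may_reserve(unused, 0, 61ULL << 20, 6)); /* prints 0 */
    /* priority-10 callers skip this test entirely in the real code. */
    return 0;
}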
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
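block_rsv_release_bytes() first shrinks the reservation target, and only the part of the actual reservation that now exceeds the target is handed back; btrfs_block_rsv_release() routes that surplus into the global rsv when it shares the same space_info and still has room. A simplified, lock-free sketch of the surplus calculation (the (u64)-1 "release everything" case is omitted):

#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t size, reserved; };

/* Simplified release: shrink 'size' by num_bytes and report how much
 * of 'reserved' becomes surplus (the real code hands that surplus to
 * the global rsv, or back to the space_info when there is none). */
static uint64_t release(struct rsv *r, uint64_t num_bytes)
{
    uint64_t surplus = 0;

    r->size -= num_bytes;
    if (r->reserved >= r->size) {
        surplus = r->reserved - r->size;
        r->reserved = r->size;
    }
    return surplus;
}

int main(void)
{
    struct rsv r = { .size = 96 << 10, .reserved = 80 << 10 };

    /* Shrinking the target by 32 KiB frees 16 KiB of real reservation. */
    printf("surplus=%llu\n", (unsigned long long)release(&r, 32 << 10));
    return 0;
}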
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
3515
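calc_global_metadata_size() sizes the global reservation as two checksum items per data block plus 2% of all used space, capped at one third of the used metadata. The worked example below assumes a 4 KiB block size and the 4-byte crc32c checksum, which are common defaults rather than the only possibilities, and it skips the final leafsize-based alignment.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t data_used = 100ULL << 30;   /* 100 GiB of data      */
    uint64_t meta_used = 2ULL << 30;     /* 2 GiB of metadata    */
    int blocksize_bits = 12;             /* assumed 4 KiB blocks */
    int csum_size = 4;                   /* assumed crc32c       */

    /* two csum copies per data block ... */
    uint64_t num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
    /* ... plus 2% of everything used ... */
    num_bytes += (data_used + meta_used) / 50;

    /* ... but never more than a third of the used metadata.
     * (The real code additionally aligns this up to leafsize << 10.) */
    if (num_bytes * 3 > meta_used)
        num_bytes = meta_used / 3;

    printf("global rsv target ~= %llu MiB\n",
           (unsigned long long)(num_bytes >> 20));  /* ~682 MiB here */
    return 0;
}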
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
3598
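calc_trans_metadata_size() charges, per item, one leaf plus one node for every remaining tree level, times three for the CoW'd paths. With BTRFS_MAX_LEVEL of 8 and the (assumed) default 4 KiB leaves and nodes, that is 96 KiB per item, or 384 KiB for the four items the orphan code reserves below:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8  /* as in ctree.h */

static uint64_t calc_trans_metadata_size(uint32_t leafsize,
                                         uint32_t nodesize, int num_items)
{
    return (uint64_t)(leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) *
           3 * num_items;
}

int main(void)
{
    /* assumed 4 KiB leaves and nodes */
    printf("1 item  -> %llu KiB\n", (unsigned long long)
           (calc_trans_metadata_size(4096, 4096, 1) >> 10));  /* 96  */
    printf("4 items -> %llu KiB\n", (unsigned long long)
           (calc_trans_metadata_size(4096, 4096, 4) >> 10));  /* 384 */
    return 0;
}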
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting orphan item, one for updating inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
 3645 * If all of the metadata space is used, we can commit the
 3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
3725
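btrfs_delalloc_reserve_metadata() above therefore reserves the per-item cost for each newly outstanding extent plus one eighth of the data bytes for checksum items (calc_csum_metadata_size). A back-of-the-envelope sketch, reusing the assumed 96 KiB per-item figure from the earlier example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t per_item = 96 << 10;     /* assumed, see calc above       */
    uint64_t num_bytes = 1 << 20;     /* a 1 MiB delalloc range        */
    int new_extents = 1;              /* one more outstanding extent   */

    uint64_t to_reserve = per_item * new_extents  /* tree items  */
                        + (num_bytes >> 3);       /* csum bytes  */

    printf("reserve %llu KiB for a 1 MiB write\n",
           (unsigned long long)(to_reserve >> 10));  /* 224 KiB */
    return 0;
}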
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
3775
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517 3833
3518 ret = btrfs_discard_extent(root, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3519 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3520 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3521
3522 ret = btrfs_add_free_space(cache, bytenr,
3523 num_bytes);
3524 WARN_ON(ret);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
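The new 'factor' in update_block_group() doubles the disk_used accounting for profiles that store two copies (DUP, RAID1, RAID10), so allocating one logical megabyte in a RAID1 group moves disk_used by two megabytes. A sketch of the flag test; the flag values here are placeholders, not the real ctree.h constants:

#include <stdint.h>
#include <stdio.h>

/* Placeholder flag bits; the real values live in ctree.h. */
#define BLOCK_GROUP_DUP    (1ULL << 0)
#define BLOCK_GROUP_RAID1  (1ULL << 1)
#define BLOCK_GROUP_RAID10 (1ULL << 2)

static int disk_factor(uint64_t flags)
{
    /* two copies on disk for DUP/RAID1/RAID10, one otherwise */
    if (flags & (BLOCK_GROUP_DUP | BLOCK_GROUP_RAID1 | BLOCK_GROUP_RAID10))
        return 2;
    return 1;
}

int main(void)
{
    uint64_t num_bytes = 1 << 20;  /* 1 MiB allocated in the group */

    printf("RAID1 disk_used += %llu\n", (unsigned long long)
           (num_bytes * disk_factor(BLOCK_GROUP_RAID1)));
    printf("single disk_used += %llu\n", (unsigned long long)
           (num_bytes * disk_factor(0)));
    return 0;
}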
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3879
3880/*
3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3573 3892
3574 set_extent_dirty(fs_info->pinned_extents, 3893 btrfs_put_block_group(cache);
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
4049 &fs_info->durable_block_rsv_list, list) {
3694 4050
3695static int pin_down_bytes(struct btrfs_trans_handle *trans, 4051 idx = trans->transid & 0x1;
3696 struct btrfs_root *root, 4052 if (block_rsv->freed[idx] > 0) {
3697 struct btrfs_path *path, 4053 block_rsv_add_bytes(block_rsv,
3698 u64 bytenr, u64 num_bytes, 4054 block_rsv->freed[idx], 0);
3699 int is_data, int reserved, 4055 block_rsv->freed[idx] = 0;
3700 struct extent_buffer **must_clean) 4056 }
3701{ 4057 if (atomic_read(&block_rsv->usage) == 0) {
3702 int err = 0; 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3703 struct extent_buffer *buf;
3704
3705 if (is_data)
3706 goto pinit;
3707
3708 /*
3709 * discard is sloooow, and so triggering discards on
3710 * individual btree blocks isn't a good plan. Just
3711 * pin everything in discard mode.
3712 */
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
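Durable reservations park bytes freed during a transaction in one of two freed[] slots, selected by the low bit of the transaction id, and the loop above folds a slot back into the reservation only once that transaction's commit is finishing. A stand-alone sketch of the slot selection; the struct and function names are invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct durable_rsv {
    uint64_t reserved;
    uint64_t freed[2];   /* indexed by transid & 0x1 */
};

/* Freeing during transaction 'transid' parks the bytes ... */
static void note_freed(struct durable_rsv *r, uint64_t transid, uint64_t bytes)
{
    r->freed[transid & 0x1] += bytes;
}

/* ... and finishing that transaction's commit folds them back in. */
static void commit(struct durable_rsv *r, uint64_t transid)
{
    int idx = transid & 0x1;

    r->reserved += r->freed[idx];
    r->freed[idx] = 0;
}

int main(void)
{
    struct durable_rsv r = { .reserved = 0, .freed = {0, 0} };

    note_freed(&r, 42, 64 << 10);   /* freed while transaction 42 runs */
    commit(&r, 42);                 /* commit of 42 recovers the bytes */
    printf("reserved=%llu\n", (unsigned long long)r.reserved);
    return 0;
}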
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 BUG_ON(block_rsv->space_info != cache->space_info);
4364
4365 if (btrfs_header_generation(buf) == trans->transid) {
4366 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4367 ret = check_ref_cleanup(trans, root, buf->start);
4368 if (!ret)
4369 goto pin;
4370 }
4371
4372 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4373 pin_down_extent(root, cache, buf->start, buf->len, 1);
4374 goto pin;
4375 }
4376
4377 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4378
4379 btrfs_add_free_space(cache, buf->start, buf->len);
4380 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4381 if (ret == -EAGAIN) {
4382 /* block group became read-only */
4383 update_reserved_bytes(cache, buf->len, 0, 1);
4384 goto out;
4385 }
4386
4387 ret = 1;
4388 spin_lock(&block_rsv->lock);
4389 if (block_rsv->reserved < block_rsv->size) {
4390 block_rsv->reserved += buf->len;
4391 ret = 0;
4392 }
4393 spin_unlock(&block_rsv->lock);
4394
4395 if (ret) {
4396 spin_lock(&cache->space_info->lock);
4397 cache->space_info->bytes_reserved -= buf->len;
4398 spin_unlock(&cache->space_info->lock);
4399 }
4400 goto out;
4401 }
4402pin:
4403 if (block_rsv->durable && !cache->ro) {
4404 ret = 0;
4405 spin_lock(&cache->lock);
4406 if (!cache->ro) {
4407 cache->reserved_pinned += buf->len;
4408 ret = 1;
4409 }
4410 spin_unlock(&cache->lock);
4411
4412 if (ret) {
4413 spin_lock(&block_rsv->lock);
4414 block_rsv->freed[trans->transid & 0x1] += buf->len;
4415 spin_unlock(&block_rsv->lock);
4416 }
4417 }
4418out:
4419 btrfs_put_block_group(cache);
4420}
4421
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4422int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4423 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4424 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4440 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4441 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4442 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4443 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4444 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4445 parent, root_objectid, owner,
@@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4449 return ret;
4068} 4450}
4069 4451
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4452static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4453{
4087 u64 mask = ((u64)root->stripesize - 1); 4454 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4501 return 0;
4135} 4502}
4136 4503
4504static int get_block_group_index(struct btrfs_block_group_cache *cache)
4505{
4506 int index;
4507 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4508 index = 0;
4509 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4510 index = 1;
4511 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4512 index = 2;
4513 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4514 index = 3;
4515 else
4516 index = 4;
4517 return index;
4518}
4519
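get_block_group_index() maps a group's profile to a bucket (0 = RAID10, 1 = RAID1, 2 = DUP, 3 = RAID0, 4 = single), and find_free_extent() now walks one bucket at a time, bumping the index and searching again when nothing fits. A minimal sketch of that fallthrough; search_bucket() is a stand-in for the real per-list search:

#include <stdio.h>

#define NR_RAID_TYPES 5

static const char *const raid_name[NR_RAID_TYPES] = {
    "raid10", "raid1", "dup", "raid0", "single",
};

/* Stand-in for searching one per-profile list of block groups. */
static int search_bucket(int index)
{
    return index == 3;  /* pretend only the raid0 list has room */
}

int main(void)
{
    /* Mirrors the allocator's "++index, goto search" fallthrough. */
    for (int index = 0; index < NR_RAID_TYPES; index++) {
        if (search_bucket(index)) {
            printf("allocated from the %s bucket\n", raid_name[index]);
            break;
        }
    }
    return 0;
}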
4137enum btrfs_loop_type { 4520enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4521 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4522 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4538 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4539 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4540 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4541 int data)
4160{ 4542{
4161 int ret = 0; 4543 int ret = 0;
@@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4550 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4551 int last_ptr_loop = 0;
4170 int loop = 0; 4552 int loop = 0;
4553 int index = 0;
4171 bool found_uncached_bg = false; 4554 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4555 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4556 bool failed_alloc = false;
@@ -4237,6 +4620,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4620 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4621 up_read(&space_info->groups_sem);
4239 } else { 4622 } else {
4623 index = get_block_group_index(block_group);
4240 goto have_block_group; 4624 goto have_block_group;
4241 } 4625 }
4242 } else if (block_group) { 4626 } else if (block_group) {
@@ -4245,7 +4629,8 @@ ideal_cache:
4245 } 4629 }
4246search: 4630search:
4247 down_read(&space_info->groups_sem); 4631 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4632 list_for_each_entry(block_group, &space_info->block_groups[index],
4633 list) {
4249 u64 offset; 4634 u64 offset;
4250 int cached; 4635 int cached;
4251 4636
@@ -4436,23 +4821,22 @@ checks:
4436 goto loop; 4821 goto loop;
4437 } 4822 }
4438 4823
4439 if (exclude_nr > 0 && 4824 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4825 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4826
4442 search_start = exclude_start + exclude_nr; 4827 if (offset < search_start)
4828 btrfs_add_free_space(block_group, offset,
4829 search_start - offset);
4830 BUG_ON(offset > search_start);
4443 4831
4832 ret = update_reserved_bytes(block_group, num_bytes, 1,
4833 (data & BTRFS_BLOCK_GROUP_DATA));
4834 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4835 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4836 goto loop;
4454 } 4837 }
4455 4838
4839 /* we are all good, lets return */
4456 ins->objectid = search_start; 4840 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4841 ins->offset = num_bytes;
4458 4842
@@ -4460,18 +4844,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4844 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4845 search_start - offset);
4462 BUG_ON(offset > search_start); 4846 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4847 break;
4468loop: 4848loop:
4469 failed_cluster_refill = false; 4849 failed_cluster_refill = false;
4470 failed_alloc = false; 4850 failed_alloc = false;
4851 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4852 btrfs_put_block_group(block_group);
4472 } 4853 }
4473 up_read(&space_info->groups_sem); 4854 up_read(&space_info->groups_sem);
4474 4855
4856 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4857 goto search;
4858
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4859 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * for them to make caching progress. Also 4860 * for them to make caching progress. Also
4477 * determine the best possible bg to cache 4861 * determine the best possible bg to cache
@@ -4485,6 +4869,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4869 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4870 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4871 allowed_chunk_alloc)) {
4872 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4873 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4874 found_uncached_bg = false;
4490 loop++; 4875 loop++;
@@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4952 int dump_block_groups)
4568{ 4953{
4569 struct btrfs_block_group_cache *cache; 4954 struct btrfs_block_group_cache *cache;
4955 int index = 0;
4570 4956
4571 spin_lock(&info->lock); 4957 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4958 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4959 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4960 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4961 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4962 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4963 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4964 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4965 (unsigned long long)info->total_bytes,
4966 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4967 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4968 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4969 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4970 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4971 spin_unlock(&info->lock);
4589 4972
4590 if (!dump_block_groups) 4973 if (!dump_block_groups)
4591 return; 4974 return;
4592 4975
4593 down_read(&info->groups_sem); 4976 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4977again:
4978 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4979 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4980 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4981 "%llu pinned %llu reserved\n",
@@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4987 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4988 spin_unlock(&cache->lock);
4605 } 4989 }
4990 if (++index < BTRFS_NR_RAID_TYPES)
4991 goto again;
4606 up_read(&info->groups_sem); 4992 up_read(&info->groups_sem);
4607} 4993}
4608 4994
@@ -4628,9 +5014,8 @@ again:
4628 5014
4629 WARN_ON(num_bytes < root->sectorsize); 5015 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5016 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5017 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5018 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5019
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5020 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5021 num_bytes = num_bytes >> 1;
@@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5053 ret = btrfs_discard_extent(root, start, len);
4669 5054
4670 btrfs_add_free_space(cache, start, len); 5055 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5056 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5057 btrfs_put_block_group(cache);
4673 5058
4674 return ret; 5059 return ret;
@@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5116 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5117 btrfs_free_path(path);
4733 5118
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5119 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5120 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5121 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5122 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5176 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5177 btrfs_free_path(path);
4794 5178
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5179 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5180 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5181 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5182 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5252 put_caching_control(caching_ctl);
4870 } 5253 }
4871 5254
4872 update_reserved_extents(block_group, ins->offset, 1); 5255 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5256 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5257 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5258 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5259 0, owner, offset, ins, 1);
4876 return ret; 5260 return ret;
4877} 5261}
4878 5262
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912 BUG_ON(!extent_op);
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5263struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5264 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5265 u64 bytenr, u32 blocksize,
@@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5298 return buf;
4975} 5299}
4976 5300
5301static struct btrfs_block_rsv *
5302use_block_rsv(struct btrfs_trans_handle *trans,
5303 struct btrfs_root *root, u32 blocksize)
5304{
5305 struct btrfs_block_rsv *block_rsv;
5306 int ret;
5307
5308 block_rsv = get_block_rsv(trans, root);
5309
5310 if (block_rsv->size == 0) {
5311 ret = reserve_metadata_bytes(block_rsv, blocksize);
5312 if (ret)
5313 return ERR_PTR(ret);
5314 return block_rsv;
5315 }
5316
5317 ret = block_rsv_use_bytes(block_rsv, blocksize);
5318 if (!ret)
5319 return block_rsv;
5320
5321 WARN_ON(1);
5322 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5323 block_rsv->size, block_rsv->reserved,
5324 block_rsv->freed[0], block_rsv->freed[1]);
5325
5326 return ERR_PTR(-ENOSPC);
5327}
5328
5329static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5330{
5331 block_rsv_add_bytes(block_rsv, blocksize, 0);
5332 block_rsv_release_bytes(block_rsv, NULL, 0);
5333}
5334
4977/* 5335/*
4978 * helper function to allocate a block for a given tree 5336 * finds a free extent and does all the dirty work required for allocation
5337 * returns the key for the extent through ins, and a tree buffer for
5338 * the first block of the extent through buf.
5339 *
4979 * returns the tree buffer or NULL. 5340 * returns the tree buffer or NULL.
4980 */ 5341 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5342struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5346 u64 hint, u64 empty_size)
4986{ 5347{
4987 struct btrfs_key ins; 5348 struct btrfs_key ins;
4988 int ret; 5349 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5350 struct extent_buffer *buf;
5351 u64 flags = 0;
5352 int ret;
5353
4990 5354
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5355 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5356 if (IS_ERR(block_rsv))
5357 return ERR_CAST(block_rsv);
5358
5359 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5360 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5361 if (ret) {
4994 BUG_ON(ret > 0); 5362 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5363 return ERR_PTR(ret);
4996 } 5364 }
4997 5365
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5366 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5367 blocksize, level);
5368 BUG_ON(IS_ERR(buf));
5369
5370 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5371 if (parent == 0)
5372 parent = ins.objectid;
5373 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5374 } else
5375 BUG_ON(parent > 0);
5376
5377 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5378 struct btrfs_delayed_extent_op *extent_op;
5379 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5380 BUG_ON(!extent_op);
5381 if (key)
5382 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5383 else
5384 memset(&extent_op->key, 0, sizeof(extent_op->key));
5385 extent_op->flags_to_set = flags;
5386 extent_op->update_key = 1;
5387 extent_op->update_flags = 1;
5388 extent_op->is_data = 0;
5389
5390 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5391 ins.offset, parent, root_objectid,
5392 level, BTRFS_ADD_DELAYED_EXTENT,
5393 extent_op);
5394 BUG_ON(ret);
5395 }
5000 return buf; 5396 return buf;
5001} 5397}
5002 5398
@@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5717 struct btrfs_path *path,
5322 struct walk_control *wc) 5718 struct walk_control *wc)
5323{ 5719{
5324 int ret = 0; 5720 int ret;
5325 int level = wc->level; 5721 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5722 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5723 u64 parent = 0;
@@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5795 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5796 }
5401 5797
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5798 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5799out:
5406 wc->refs[level] = 0; 5800 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5801 wc->flags[level] = 0;
5408 return ret; 5802 return 0;
5409} 5803}
5410 5804
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5805static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5877 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5878 * blocks are properly updated.
5485 */ 5879 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5880int btrfs_drop_snapshot(struct btrfs_root *root,
5881 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5882{
5488 struct btrfs_path *path; 5883 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5884 struct btrfs_trans_handle *trans;
@@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5896 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5897 BUG_ON(!wc);
5503 5898
5504 trans = btrfs_start_transaction(tree_root, 1); 5899 trans = btrfs_start_transaction(tree_root, 0);
5900 if (block_rsv)
5901 trans->block_rsv = block_rsv;
5505 5902
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5903 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5904 level = btrfs_header_level(root->node);
@@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5986 }
5590 5987
5591 BUG_ON(wc->level == 0); 5988 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5989 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5990 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5991 &root->root_key,
5596 root_item); 5992 root_item);
5597 BUG_ON(ret); 5993 BUG_ON(ret);
5598 5994
5599 btrfs_end_transaction(trans, tree_root); 5995 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5996 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5997 if (block_rsv)
5602 unsigned long update; 5998 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 5999 }
5609 } 6000 }
5610 btrfs_release_path(root, path); 6001 btrfs_release_path(root, path);
@@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6023 kfree(root);
5633 } 6024 }
5634out: 6025out:
5635 btrfs_end_transaction(trans, tree_root); 6026 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6027 kfree(wc);
5637 btrfs_free_path(path); 6028 btrfs_free_path(path);
5638 return err; 6029 return err;
@@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7619 return flags;
7229} 7620}
7230 7621
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7622static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7623{
7235 struct btrfs_trans_handle *trans; 7624 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7625 u64 num_bytes;
7237 u64 calc; 7626 int ret = -ENOSPC;
7238 7627
7239 spin_lock(&shrink_block_group->lock); 7628 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7629 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7630
7244 trans = btrfs_start_transaction(root, 1); 7631 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7632 spin_lock(&cache->lock);
7633 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7634 cache->bytes_super - btrfs_block_group_used(&cache->item);
7635
7636 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7637 sinfo->bytes_may_use + sinfo->bytes_readonly +
7638 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7639 sinfo->bytes_readonly += num_bytes;
7640 sinfo->bytes_reserved += cache->reserved_pinned;
7641 cache->reserved_pinned = 0;
7642 cache->ro = 1;
7643 ret = 0;
7644 }
7645 spin_unlock(&cache->lock);
7646 spin_unlock(&sinfo->lock);
7647 return ret;
7648}
7246 7649
7247 new_alloc_flags = update_block_group_flags(root, 7650int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7651 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7652
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7653{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7654 struct btrfs_trans_handle *trans;
7655 u64 alloc_flags;
7656 int ret;
7259 7657
7260 btrfs_end_transaction(trans, root); 7658 BUG_ON(cache->ro);
7261 } else 7659
7262 spin_unlock(&shrink_block_group->lock); 7660 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7661 BUG_ON(IS_ERR(trans));
7264}
7265 7662
7663 alloc_flags = update_block_group_flags(root, cache->flags);
7664 if (alloc_flags != cache->flags)
7665 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7666
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7667 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7668 if (!ret)
7669 goto out;
7670 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7671 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7672 if (ret < 0)
7673 goto out;
7674 ret = set_block_group_ro(cache);
7675out:
7676 btrfs_end_transaction(trans, root);
7677 return ret;
7678}
7269 7679
7680int btrfs_set_block_group_rw(struct btrfs_root *root,
7681 struct btrfs_block_group_cache *cache)
7270{ 7682{
7271 __alloc_chunk_for_shrink(root, group, 1); 7683 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7684 u64 num_bytes;
7685
7686 BUG_ON(!cache->ro);
7687
7688 spin_lock(&sinfo->lock);
7689 spin_lock(&cache->lock);
7690 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7691 cache->bytes_super - btrfs_block_group_used(&cache->item);
7692 sinfo->bytes_readonly -= num_bytes;
7693 cache->ro = 0;
7694 spin_unlock(&cache->lock);
7695 spin_unlock(&sinfo->lock);
7273 return 0; 7696 return 0;
7274} 7697}
7275 7698
@@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7859 */
7437 synchronize_rcu(); 7860 synchronize_rcu();
7438 7861
7862 release_global_block_rsv(info);
7863
7439 while(!list_empty(&info->space_info)) { 7864 while(!list_empty(&info->space_info)) {
7440 space_info = list_entry(info->space_info.next, 7865 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7866 struct btrfs_space_info,
7442 list); 7867 list);
7443 7868 if (space_info->bytes_pinned > 0 ||
7869 space_info->bytes_reserved > 0) {
7870 WARN_ON(1);
7871 dump_space_info(space_info, 0, 0);
7872 }
7444 list_del(&space_info->list); 7873 list_del(&space_info->list);
7445 kfree(space_info); 7874 kfree(space_info);
7446 } 7875 }
7447 return 0; 7876 return 0;
7448} 7877}
7449 7878
7879static void __link_block_group(struct btrfs_space_info *space_info,
7880 struct btrfs_block_group_cache *cache)
7881{
7882 int index = get_block_group_index(cache);
7883
7884 down_write(&space_info->groups_sem);
7885 list_add_tail(&cache->list, &space_info->block_groups[index]);
7886 up_write(&space_info->groups_sem);
7887}
7888
7450int btrfs_read_block_groups(struct btrfs_root *root) 7889int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7890{
7452 struct btrfs_path *path; 7891 struct btrfs_path *path;
@@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7907
7469 while (1) { 7908 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7909 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7910 if (ret > 0)
7472 ret = 0; 7911 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7912 if (ret != 0)
7476 goto error; 7913 goto error;
7477 7914
@@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7917 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7918 if (!cache) {
7482 ret = -ENOMEM; 7919 ret = -ENOMEM;
7483 break; 7920 goto error;
7484 } 7921 }
7485 7922
7486 atomic_set(&cache->count, 1); 7923 atomic_set(&cache->count, 1);
@@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7974 BUG_ON(ret);
7538 cache->space_info = space_info; 7975 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7976 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7977 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7978 spin_unlock(&cache->space_info->lock);
7542 7979
7543 down_write(&space_info->groups_sem); 7980 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7981
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7982 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7983 BUG_ON(ret);
7549 7984
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7985 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7986 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7987 set_block_group_ro(cache);
7553 } 7988 }
7989
7990 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7991 if (!(get_alloc_profile(root, space_info->flags) &
7992 (BTRFS_BLOCK_GROUP_RAID10 |
7993 BTRFS_BLOCK_GROUP_RAID1 |
7994 BTRFS_BLOCK_GROUP_DUP)))
7995 continue;
7996 /*
7997 * avoid allocating from un-mirrored block group if there are
7998 * mirrored block groups.
7999 */
8000 list_for_each_entry(cache, &space_info->block_groups[3], list)
8001 set_block_group_ro(cache);
8002 list_for_each_entry(cache, &space_info->block_groups[4], list)
8003 set_block_group_ro(cache);
8004 }
8005
8006 init_global_block_rsv(info);
7554 ret = 0; 8007 ret = 0;
7555error: 8008error:
7556 btrfs_free_path(path); 8009 btrfs_free_path(path);
@@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8064 BUG_ON(ret);
7612 8065
7613 spin_lock(&cache->space_info->lock); 8066 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8067 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8068 spin_unlock(&cache->space_info->lock);
7616 8069
7617 down_write(&cache->space_info->groups_sem); 8070 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8071
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8072 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8073 BUG_ON(ret);
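
A note on the set_block_group_ro()/btrfs_set_block_group_rw() hunks above: a block group may only go read-only when the space info can absorb the group's unused bytes without overcommitting the total. The C sketch below restates that arithmetic with hypothetical, flattened structures (the field names mirror the ones in the hunk, but this is an illustration, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical, flattened mirrors of the counters used by set_block_group_ro(). */
struct toy_space_info {
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t bytes_reserved;
	uint64_t bytes_pinned;
	uint64_t bytes_may_use;
	uint64_t bytes_readonly;
};

struct toy_block_group {
	uint64_t size;            /* cache->key.offset in the real code */
	uint64_t reserved;
	uint64_t pinned;
	uint64_t bytes_super;
	uint64_t used;            /* btrfs_block_group_used(&cache->item) */
	uint64_t reserved_pinned;
};

/*
 * True when the group may become read-only: the free space it would take
 * out of circulation still fits under total_bytes once every other
 * commitment is counted.
 */
bool toy_can_set_group_ro(const struct toy_space_info *s,
			  const struct toy_block_group *g)
{
	uint64_t unused = g->size - g->reserved - g->pinned -
			  g->bytes_super - g->used;

	return s->bytes_used + s->bytes_reserved + s->bytes_pinned +
	       s->bytes_may_use + s->bytes_readonly +
	       g->reserved_pinned + unused < s->total_bytes;
}

On success the real function also folds the unused bytes into bytes_readonly, converts reserved_pinned into bytes_reserved and sets cache->ro, all under the two spinlocks; btrfs_set_block_group_rw() undoes the bytes_readonly part.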
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
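
The loop added to __extent_read_full_page() above follows a recurring btrfs pattern: take the extent lock, look for an ordered extent still covering the range, and if there is one, drop the lock, wait for it to finish, then try again. The sketch below models only that control flow; lock_range(), unlock_range(), lookup_ordered() and wait_ordered() are hypothetical stand-ins (faked with a counter so the example runs), not the real btrfs calls.

#include <stdio.h>

static int pending_ordered = 2;   /* pretend two ordered extents must finish first */

static void lock_range(void)   { /* lock_extent() in the real code */ }
static void unlock_range(void) { /* unlock_extent() in the real code */ }

static int lookup_ordered(void)   /* stands in for btrfs_lookup_ordered_extent() */
{
	return pending_ordered > 0;
}

static void wait_ordered(void)    /* stands in for btrfs_start_ordered_extent(..., 1) */
{
	pending_ordered--;
}

int main(void)
{
	int tries = 0;

	/* same shape as the new loop: only proceed once the range is clean */
	while (1) {
		lock_range();
		tries++;
		if (!lookup_ordered())
			break;          /* no ordered extent: keep the lock, do the read */
		unlock_range();
		wait_ordered();         /* let the pending write reach disk first */
	}
	printf("range locked cleanly after %d attempts\n", tries);
	unlock_range();
	return 0;
}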
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
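
EXTENT_FIRST_DELALLOC and EXTENT_CTLBITS, added above, act as pure control bits: they travel in the int *bits argument so the set/clear hooks can see and strip them, but they are masked out before anything reaches state->state. A minimal self-contained sketch of that masking (the DO_ACCOUNTING and FIRST_DELALLOC values are the ones from the header; the DELALLOC value is only illustrative):

#include <assert.h>

#define EXTENT_DELALLOC        (1 << 5)    /* exact value immaterial for the example */
#define EXTENT_DO_ACCOUNTING   (1 << 11)
#define EXTENT_FIRST_DELALLOC  (1 << 12)
#define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

struct toy_state { int state; };

/* Mirrors set_state_bits(): hooks get *bits, the state only stores real flags. */
void toy_set_state_bits(struct toy_state *s, int *bits)
{
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	/* a set_bit_hook would run here and may clear EXTENT_FIRST_DELALLOC in *bits */
	s->state |= bits_to_set;
}

int main(void)
{
	struct toy_state s = { 0 };
	int bits = EXTENT_DELALLOC | EXTENT_FIRST_DELALLOC;

	toy_set_state_bits(&s, &bits);
	assert(s.state == EXTENT_DELALLOC);      /* control bit never lands in the tree */
	assert(bits & EXTENT_FIRST_DELALLOC);    /* but stays visible to the caller */
	return 0;
}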
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
149} 149}
150 150
151 151
152int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
153 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
154{ 155{
155 u32 sum; 156 u32 sum;
156 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
157 int bio_index = 0; 158 int bio_index = 0;
158 u64 offset; 159 u64 offset = 0;
159 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
160 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
161 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
174 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
175 176
176 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
177 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
178 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
179 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
180 if (ret == 0) 184 if (ret == 0)
181 goto found; 185 goto found;
@@ -238,6 +242,7 @@ found:
238 else 242 else
239 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
240 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
241 bio_index++; 246 bio_index++;
242 bvec++; 247 bvec++;
243 } 248 }
@@ -245,6 +250,18 @@ found:
245 return 0; 250 return 0;
246} 251}
247 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
248int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
249 struct list_head *list) 266 struct list_head *list)
250{ 267{
@@ -657,6 +674,9 @@ again:
657 goto found; 674 goto found;
658 } 675 }
659 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
660 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
661 u32 item_size; 681 u32 item_size;
662 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
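
The refactored __btrfs_lookup_bio_sums() above tracks the file offset in two ways: for buffered reads it is recomputed from each page (page_offset + bv_offset), while for direct I/O it starts at the logical offset passed with the bio and simply advances by bv_len, just like disk_bytenr. Below is a small stand-alone loop illustrating the two modes; struct seg is a hypothetical simplification of struct bio_vec, not a kernel type.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical, simplified stand-in for one bio_vec worth of data. */
struct seg {
	uint64_t page_offset;   /* page_offset(bvec->bv_page) in the real code */
	uint32_t bv_offset;
	uint32_t bv_len;
};

/*
 * Walk the segments the way __btrfs_lookup_bio_sums() does: for direct I/O
 * the file offset starts at the logical offset handed in with the bio and
 * advances by bv_len; for buffered I/O it is recomputed from each page.
 */
static void walk(const struct seg *segs, int n, uint64_t logical_offset, int dio)
{
	uint64_t offset = dio ? logical_offset : 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!dio)
			offset = segs[i].page_offset + segs[i].bv_offset;
		printf("segment %d: file offset %llu, %u bytes\n",
		       i, (unsigned long long)offset, segs[i].bv_len);
		offset += segs[i].bv_len;   /* mirrors the new 'offset += bvec->bv_len' */
	}
}

int main(void)
{
	struct seg segs[2] = {
		{ .page_offset = 4096, .bv_offset = 0, .bv_len = 4096 },
		{ .page_offset = 8192, .bv_offset = 0, .bv_len = 4096 },
	};

	walk(segs, 2, 4096, 1);   /* dio: offsets 4096, 8192 */
	walk(segs, 2, 0, 0);      /* buffered: offsets come from the pages themselves */
	return 0;
}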
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..79437c5eeb1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 47 int write_bytes,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 const char __user *buf) 49 struct iov_iter *i)
50{ 50{
51 long page_fault = 0; 51 size_t copied;
52 int i; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 54
55 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
59 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
60 62
61 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
62 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
63 page_fault = __copy_from_user(page_address(page) + offset, 65
64 buf, count);
65 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
66 flush_dcache_page(page); 67 flush_dcache_page(page);
67 kunmap(page); 68 iov_iter_advance(i, copied);
68 buf += count; 69 write_bytes -= copied;
69 write_bytes -= count;
70 70
71 if (page_fault) 71 if (unlikely(copied == 0)) {
72 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
73 } 83 }
74 return page_fault ? -EFAULT : 0; 84 return 0;
75} 85}
76 86
77/* 87/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
126 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
127 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
128 NULL); 138 NULL);
129 if (err) 139 BUG_ON(err);
130 return err;
131 140
132 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
133 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
142 * at this time. 151 * at this time.
143 */ 152 */
144 } 153 }
145 return err; 154 return 0;
146} 155}
147 156
148/* 157/*
@@ -823,45 +832,46 @@ again:
823 return 0; 832 return 0;
824} 833}
825 834
826static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
827 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
828{ 838{
829 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
830 loff_t start_pos; 846 loff_t start_pos;
831 ssize_t num_written = 0; 847 ssize_t num_written = 0;
832 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
833 int ret = 0; 851 int ret = 0;
834 struct inode *inode = fdentry(file)->d_inode;
835 struct btrfs_root *root = BTRFS_I(inode)->root;
836 struct page **pages = NULL;
837 int nrptrs; 852 int nrptrs;
838 struct page *pinned[2];
839 unsigned long first_index; 853 unsigned long first_index;
840 unsigned long last_index; 854 unsigned long last_index;
841 int will_write; 855 int will_write;
856 int buffered = 0;
842 857
843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
844 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
845 860
846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
847 PAGE_CACHE_SIZE / (sizeof(struct page *)));
848 pinned[0] = NULL; 861 pinned[0] = NULL;
849 pinned[1] = NULL; 862 pinned[1] = NULL;
850 863
851 pos = *ppos;
852 start_pos = pos; 864 start_pos = pos;
853 865
854 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
855 867
856 /* do the reserve before the mutex lock in case we have to do some
857 * flushing. We wouldn't deadlock, but this is more polite.
858 */
859 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
860 if (err)
861 goto out_nolock;
862
863 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
864 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
865 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
866 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
867 if (err) 877 if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
875 goto out; 885 goto out;
876 886
877 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
878 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
879 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
880 929
881 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
882 start_pos = pos; 931 start_pos = pos;
883 932
884 BTRFS_I(inode)->sequence++;
885 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
886 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
887 935
888 /* 936 /*
889 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
900 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
901 } 949 }
902 } 950 }
903 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
904 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
905 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
906 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
911 } 959 }
912 } 960 }
913 961
914 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
915 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
916 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
917 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
918 offset); 966 offset);
919 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
920 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
923 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
924 972
925 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
926 if (ret) 974 if (ret)
927 goto out; 975 goto out;
928 976
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
930 pos, first_index, last_index, 978 pos, first_index, last_index,
931 write_bytes); 979 write_bytes);
932 if (ret) { 980 if (ret) {
933 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
934 write_bytes);
935 goto out; 982 goto out;
936 } 983 }
937 984
938 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
939 write_bytes, pages, buf); 986 write_bytes, pages, &i);
940 if (ret) { 987 if (ret == 0) {
941 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
942 write_bytes); 989 num_pages, pos, write_bytes);
943 btrfs_drop_pages(pages, num_pages);
944 goto out;
945 } 990 }
946 991
947 ret = dirty_and_release_pages(NULL, root, file, pages,
948 num_pages, pos, write_bytes);
949 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
950 if (ret) { 993 if (ret) {
951 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
952 write_bytes);
953 goto out; 995 goto out;
954 } 996 }
955 997
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
965 btrfs_throttle(root); 1007 btrfs_throttle(root);
966 } 1008 }
967 1009
968 buf += write_bytes;
969 count -= write_bytes;
970 pos += write_bytes; 1010 pos += write_bytes;
971 num_written += write_bytes; 1011 num_written += write_bytes;
972 1012
@@ -976,9 +1016,7 @@ out:
976 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
977 if (ret) 1017 if (ret)
978 err = ret; 1018 err = ret;
979 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
980 1019
981out_nolock:
982 kfree(pages); 1020 kfree(pages);
983 if (pinned[0]) 1021 if (pinned[0])
984 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
1008 num_written = err; 1046 num_written = err;
1009 1047
1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1011 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1012 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1013 file->f_dentry); 1051 file->f_dentry);
1014 if (ret == 0) { 1052 if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
1023 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1024 } 1062 }
1025 } 1063 }
1026 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1027 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1028 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1029 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1104,9 +1142,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1104 if (file && file->private_data) 1142 if (file && file->private_data)
1105 btrfs_ioctl_trans_end(file); 1143 btrfs_ioctl_trans_end(file);
1106 1144
1107 trans = btrfs_start_transaction(root, 1); 1145 trans = btrfs_start_transaction(root, 0);
1108 if (!trans) { 1146 if (IS_ERR(trans)) {
1109 ret = -ENOMEM; 1147 ret = PTR_ERR(trans);
1110 goto out; 1148 goto out;
1111 } 1149 }
1112 1150
@@ -1161,7 +1199,7 @@ const struct file_operations btrfs_file_operations = {
1161 .read = do_sync_read, 1199 .read = do_sync_read,
1162 .aio_read = generic_file_aio_read, 1200 .aio_read = generic_file_aio_read,
1163 .splice_read = generic_file_splice_read, 1201 .splice_read = generic_file_splice_read,
1164 .write = btrfs_file_write, 1202 .aio_write = btrfs_file_aio_write,
1165 .mmap = btrfs_file_mmap, 1203 .mmap = btrfs_file_mmap,
1166 .open = generic_file_open, 1204 .open = generic_file_open,
1167 .release = btrfs_release_file, 1205 .release = btrfs_release_file,
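
The rewritten btrfs_copy_from_user() above walks an iov_iter instead of a raw user pointer: it copies into the current prepared page at 'offset', only moves to the next page once that page is full, and a short copy merely advances the offset so the loop retries the remainder. Below is a toy, self-contained version of that bookkeeping; memcpy stands in for iov_iter_copy_from_user(), so the copied-zero retry branch of the real code is omitted, and TOY_PAGE_SIZE is deliberately tiny.

#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 8    /* tiny "page" so the walk is easy to follow */

/*
 * Same page/offset bookkeeping as the reworked btrfs_copy_from_user():
 * fill the current page starting at 'offset', stay on it until it is full,
 * then move to the next page and restart at offset 0.
 */
static void toy_copy(char pages[][TOY_PAGE_SIZE], int num_pages,
		     const char *buf, size_t write_bytes, size_t offset)
{
	int pg = 0;

	while (write_bytes > 0 && pg < num_pages) {
		size_t count = TOY_PAGE_SIZE - offset;

		if (count > write_bytes)
			count = write_bytes;

		memcpy(pages[pg] + offset, buf, count);   /* "copied" == count here */
		buf += count;
		write_bytes -= count;

		if (offset + count < TOY_PAGE_SIZE) {
			offset += count;   /* page not full yet: keep filling it */
		} else {
			pg++;              /* page filled: next page, offset resets */
			offset = 0;
		}
	}
}

int main(void)
{
	char pages[2][TOY_PAGE_SIZE];

	memset(pages, '.', sizeof(pages));
	toy_copy(pages, 2, "hello, iov_it", 13, 3);   /* write starts mid-page */
	printf("%.8s|%.8s\n", pages[0], pages[1]);    /* prints ...hello|, iov_it */
	return 0;
}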
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..fa6ccc1bfe2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
252 inline_len, compressed_size, 252 inline_len, compressed_size,
253 compressed_pages); 253 compressed_pages);
254 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
255 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
256 return 0; 257 return 0;
257} 258}
@@ -414,6 +415,7 @@ again:
414 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
415 BUG_ON(!trans); 416 BUG_ON(!trans);
416 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
417 419
418 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
419 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
439 start, end, NULL, 441 start, end, NULL,
440 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
441 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
442 EXTENT_CLEAR_ACCOUNTING |
443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
444 445
445 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
697 return 0; 698 return 0;
698} 699}
699 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
700/* 733/*
701 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
702 * the call backs end up in this code. The basic idea is to 735 * the call backs end up in this code. The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
734 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
735 BUG_ON(!trans); 768 BUG_ON(!trans);
736 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
737 771
738 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
739 773
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
753 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
754 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
755 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
756 EXTENT_CLEAR_ACCOUNTING |
757 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
758 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
759 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
769 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
770 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
771 804
772 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
773 read_lock(&BTRFS_I(inode)->extent_tree.lock);
774 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
775 start, num_bytes);
776 if (em) {
777 /*
778 * if block start isn't an actual block number then find the
779 * first block in this inode and use that as a hint. If that
780 * block is also bogus then just don't worry about it.
781 */
782 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
783 free_extent_map(em);
784 em = search_extent_mapping(em_tree, 0, 0);
785 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
786 alloc_hint = em->block_start;
787 if (em)
788 free_extent_map(em);
789 } else {
790 alloc_hint = em->block_start;
791 free_extent_map(em);
792 }
793 }
794 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
795 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
796 807
797 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
1174 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1175 BUG_ON(ret); 1186 BUG_ON(ret);
1176 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1177 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1178 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1179 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1226} 1244}
1227 1245
1228static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1229 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1230{ 1248{
1249 /* not delalloc, ignore it */
1231 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1232 return 0; 1251 return 0;
1233 1252
1234 spin_lock(&BTRFS_I(inode)->accounting_lock); 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1235 BTRFS_I(inode)->outstanding_extents++;
1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1237
1238 return 0; 1254 return 0;
1239} 1255}
1240 1256
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1252 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1253 return 0; 1269 return 0;
1254 1270
1255 spin_lock(&BTRFS_I(inode)->accounting_lock); 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1256 BTRFS_I(inode)->outstanding_extents--;
1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1258
1259 return 0; 1272 return 0;
1260} 1273}
1261 1274
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1265 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1266 */ 1279 */
1267static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1268 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1269{ 1282{
1270 1283
1271 /* 1284 /*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1273 * but in this case, we are only testing for the DELALLOC 1286 * but in this case, we are only testing for the DELALLOC
1274 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1275 */ 1288 */
1276 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1277 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1278 1292
1279 spin_lock(&BTRFS_I(inode)->accounting_lock); 1293 if (*bits & EXTENT_FIRST_DELALLOC)
1280 BTRFS_I(inode)->outstanding_extents++; 1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1281 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1295 else
1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1283 1297
1284 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1286 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1287 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1288 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1289 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1297 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1298 */ 1312 */
1299static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1300 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1301{ 1315{
1302 /* 1316 /*
1303 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1304 * but in this case, we are only testing for the DELALLOC 1318 * but in this case, we are only testing for the DELALLOC
1305 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1306 */ 1320 */
1307 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1308 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1309 1324
1310 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1311 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents); 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1313 BTRFS_I(inode)->outstanding_extents--; 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1314 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1329
1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1316 } 1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1317 1335
1318 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1319 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1320 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1321 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1322 "%llu %llu\n",
1323 (unsigned long long)
1324 state->end - state->start + 1,
1325 (unsigned long long)
1326 root->fs_info->delalloc_bytes);
1327 btrfs_delalloc_free_space(root, inode, (u64)-1);
1328 root->fs_info->delalloc_bytes = 0;
1329 BTRFS_I(inode)->delalloc_bytes = 0;
1330 } else {
1331 btrfs_delalloc_free_space(root, inode,
1332 state->end -
1333 state->start + 1);
1334 root->fs_info->delalloc_bytes -= state->end -
1335 state->start + 1;
1336 BTRFS_I(inode)->delalloc_bytes -= state->end -
1337 state->start + 1;
1338 }
1339 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1340 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1341 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
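
With the hooks above, outstanding_extents becomes an atomic counter and EXTENT_FIRST_DELALLOC keeps it from being double-counted: the reservation path bumps the counter itself and marks the first DELALLOC set of that range so the hook stays quiet, while later splits and merges of the delalloc range increment and decrement it. A toy version of the set-side discipline (simplified condition, hypothetical toy_* names; the bit values are the ones from extent_io.h):

#include <assert.h>

#define EXTENT_DELALLOC        (1 << 5)    /* exact value immaterial here */
#define EXTENT_FIRST_DELALLOC  (1 << 12)

static int outstanding_extents;            /* atomic_t on the real inode */

/* Mirrors the counting in btrfs_set_bit_hook(): the first DELALLOC set of a
 * freshly reserved range was already counted by the reservation itself, so
 * the hook strips the marker and stays quiet; any other set counts. */
static void toy_set_hook(int *bits)
{
	if (!(*bits & EXTENT_DELALLOC))
		return;
	if (*bits & EXTENT_FIRST_DELALLOC)
		*bits &= ~EXTENT_FIRST_DELALLOC;
	else
		outstanding_extents++;
}

int main(void)
{
	int bits = EXTENT_DELALLOC | EXTENT_FIRST_DELALLOC;

	outstanding_extents++;       /* done by the metadata reservation */
	toy_set_hook(&bits);         /* hook sees FIRST_DELALLOC, stays quiet */
	assert(outstanding_extents == 1);

	outstanding_extents++;       /* btrfs_split_extent_hook(): one more extent */
	outstanding_extents--;       /* btrfs_merge_extent_hook(): merged back */
	assert(outstanding_extents == 1);
	return 0;
}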
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1384 */ 1385 */
1385static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1386 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1387 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1388{ 1390{
1389 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1390 int ret = 0; 1392 int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1403 * are inserted into the btree 1405 * are inserted into the btree
1404 */ 1406 */
1405static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1406 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1407{ 1410{
1408 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1409 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1414 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1415 */ 1418 */
1416static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1417 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1418{ 1422{
1419 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1420 int ret = 0; 1424 int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1439 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1440 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1441 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1442 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1443 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1444 } 1449 }
1445 1450
@@ -1520,6 +1525,7 @@ again:
1520 goto again; 1525 goto again;
1521 } 1526 }
1522 1527
1528 BUG();
1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1524 ClearPageChecked(page); 1530 ClearPageChecked(page);
1525out: 1531out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1650static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1651{ 1657{
1652 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1653 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1654 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1668 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1669 if (!ret) { 1675 if (!ret) {
1670 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1671 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1672 BUG_ON(ret); 1680 BUG_ON(ret);
1673 btrfs_end_transaction(trans, root);
1674 } 1681 }
1675 goto out; 1682 goto out;
1676 } 1683 }
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1680 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1681 1688
1682 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1683 1692
1684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1685 compressed = 1; 1694 compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1712 &ordered_extent->list); 1721 &ordered_extent->list);
1713 1722
1714 /* this also removes the ordered extent from the tree */
1715 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1716 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1717 BUG_ON(ret); 1725 BUG_ON(ret);
1718 btrfs_end_transaction(trans, root);
1719out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1720 /* once for us */ 1730 /* once for us */
1721 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1722 /* once for the tree */ 1732 /* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1838 1848
1839 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1840 failrec->last_mirror, 1850 failrec->last_mirror,
1841 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1842 return 0; 1852 return 0;
1843} 1853}
1844 1854
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1993} 2003}
1994 2004
1995/* 2005/*
 2006 * calculate the extra metadata reservation needed when snapshotting a
 2007 * subvolume that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
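To make the top-up rule concrete, here is the same computation with illustrative numbers (the freed[] slot is picked by the low bit of the transaction id, exactly as in the function above):

	/*
	 * index                    = trans->transid & 0x1             -> freed[0] or freed[1]
	 * num_bytes                = block_rsv->size                  =  4 MiB
	 * reserved + freed[index]  = 1 MiB + 1 MiB                    =  2 MiB  (< size, so top up)
	 * num_bytes               += size - (reserved + freed[index]) = +2 MiB
	 * *bytes_to_reserve       += 6 MiB
	 */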
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
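Continuing the illustrative numbers from the pre-snapshot hook, and assuming the snapshot-creation code reserves the requested bytes into pending->block_rsv (that step lives outside this file), the post hook spends exactly what was asked for:

	/*
	 * pending->block_rsv starts with the 6 MiB computed above:
	 *   2 MiB migrated into root->orphan_block_rsv  (refill the source subvolume)
	 *   4 MiB migrated into snap->orphan_block_rsv  (new durable rsv for the snapshot)
	 * both moves use btrfs_block_rsv_migrate(), as in the functions above.
	 */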
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
 2094 * This is called at transaction commit time. If there are no orphan
 2095 * files in the subvolume, it removes the orphan item and frees the block_rsv
2096 * structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
1996 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
1997 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
 2126 * NOTE: the caller of this function should reserve 5 units of metadata
 2127 * for it.
1998 */ 2128 */
1999int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2000{ 2130{
2001 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2002 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2003 2136
2004 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2005 2141
2006 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2007 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2008 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2009 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2148 }
2149
2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
 2155 * cleanup when mounting. But this introduces a backward
 2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2010 } 2166 }
2011 2167
2012 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2013 2173
2014 spin_unlock(&root->list_lock); 2174 if (block_rsv)
2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2015 2176
2016 /* 2177 /* grab metadata reservation from transaction handle */
2017 * insert an orphan item to track this unlinked/truncated file 2178 if (reserve) {
2018 */ 2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2019 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2180 BUG_ON(ret);
2181 }
2020 2182
2021 return ret; 2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
 2189 /* insert an orphan item to track that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2022} 2196}
2023 2197
2024/* 2198/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2028int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2029{ 2203{
2030 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2031 int ret = 0; 2207 int ret = 0;
2032 2208
2033 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2034 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2035 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2036 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2037 return 0;
2038 } 2213 }
2039 2214
2040 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2041 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2042 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2043 return 0;
2044 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2045 2220
2046 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2047 2225
2048 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2049 2228
2050 return ret; 2229 return 0;
2051} 2230}
2052 2231
2053/* 2232/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2064 struct inode *inode; 2243 struct inode *inode;
2065 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2066 2245
2067 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2068 return; 2247 return;
2069 2248
2070 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2117 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2118 found_key.offset = 0; 2297 found_key.offset = 0;
2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2120 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2121 break;
2122 2300
2123 /* 2301 /*
2124 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2125 * the proper thing when we hit it 2303 * the proper thing when we hit it
2126 */ 2304 */
2127 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2128 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2129 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2130 2308
2131 /* 2309 /*
 2132 * if this is a bad inode, it means we actually succeeded in 2310
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2135 * do a destroy_inode 2313 * do a destroy_inode
2136 */ 2314 */
2137 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2138 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2139 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2140 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2141 iput(inode); 2319 iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2154 iput(inode); 2332 iput(inode);
2155 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2156 2346
2157 if (nr_unlink) 2347 if (nr_unlink)
2158 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2159 if (nr_truncate) 2349 if (nr_truncate)
2160 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2161
2162 btrfs_free_path(path);
2163} 2351}
2164 2352
2165/* 2353/*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
 2695 * unlink and rmdir are special in btrfs: they do not always free space,
 2696 * so in the enospc case we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
 2494 */ 2720 /* check if someone else holds a reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2498 2729
2499 trans = btrfs_start_transaction(root, 1); 2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2503 } 2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521} 2882}
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587{ 2948{
2588 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2950 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2598 2958
2599 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2960 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2607 }
2608 2962
2609 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2610 2964
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2628out: 2982out:
2629 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2633 2986
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2987 return err;
2637} 2988}
2638 2989
@@ -3029,6 +3380,7 @@ out:
3029 if (pending_del_nr) { 3380 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3032 } 3384 }
3033 btrfs_free_path(path); 3385 btrfs_free_path(path);
3034 return err; 3386 return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3408
3057 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3410 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3412 if (ret)
3065 goto out; 3413 goto out;
3066 3414
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068again: 3416again:
3069 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3070 if (!page) { 3418 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3420 goto out;
3074 } 3421 }
3075 3422
@@ -3132,8 +3479,7 @@ again:
3132 3479
3133out_unlock: 3480out_unlock:
3134 if (ret) 3481 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3483 unlock_page(page);
3138 page_cache_release(page); 3484 page_cache_release(page);
3139out: 3485out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3494 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3185 3531
3186 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3188 break; 3535 break;
3189 3536 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3192 3538
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3551 last_byte - 1, 0);
3206 3552
3207 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3554 }
3210 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3211 cur_offset = last_byte; 3557 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3213 break; 3559 break;
3214 } 3560 }
3215 3561
3562 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3564 GFP_NOFS);
3218 return err; 3565 return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3586 }
3240 } 3587 }
3241 3588
3242 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3590 if (IS_ERR(trans))
3244 return ret; 3591 return PTR_ERR(trans);
3245 3592
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3248 3594
3249 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3597
3252 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3256 3601
3257 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3611
3267 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3269 3617
3270 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3346 3694
3347 while (1) { 3695 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3351 3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3353 break; 3712 break;
3354 3713
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3716 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3359 } 3719 }
3360 3720
3361 if (ret == 0) { 3721 if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
3596 return 0; 3956 return 0;
3597} 3957}
3598 3958
3599static noinline void init_btrfs_i(struct inode *inode)
3600{
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626}
3627
3628static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629{ 3960{
3630 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3965 return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4022 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3694 4024
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4279 int ret = 0;
3952 4280
3953 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4282 return 0;
3955 4283
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971{ 4299{
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
3974 4306
3975 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, lets try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: fail to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: fail to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
3978 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
3979} 4335}
3980 4336
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4092 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4093 * number 4449 * number
4094 */ 4450 */
4095 init_btrfs_i(inode);
4096 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4097 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4098 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4247 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4248 return -EINVAL; 4603 return -EINVAL;
4249 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4250 /* 4609 /*
4251 * 2 for inode item and ref 4610 * 2 for inode item and ref
4252 * 2 for dir items 4611 * 2 for dir items
4253 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4254 */ 4613 */
4255 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4256 if (err) 4615 if (IS_ERR(trans))
4257 return err; 4616 return PTR_ERR(trans);
4258 4617
4259 trans = btrfs_start_transaction(root, 1);
4260 if (!trans)
4261 goto fail;
4262 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4263 4619
4264 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4265 if (err) {
4266 err = -ENOSPC;
4267 goto out_unlock;
4268 }
4269
4270 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4271 dentry->d_name.len, 4621 dentry->d_name.len,
4272 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4295out_unlock: 4645out_unlock:
4296 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4297 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4298fail: 4648 btrfs_btree_balance_dirty(root, nr);
4299 btrfs_unreserve_metadata_space(root, 5);
4300 if (drop_inode) { 4649 if (drop_inode) {
4301 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4302 iput(inode); 4651 iput(inode);
4303 } 4652 }
4304 btrfs_btree_balance_dirty(root, nr);
4305 return err; 4653 return err;
4306} 4654}
4307 4655
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4311 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4312 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4313 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4314 int err;
4315 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4316 unsigned long nr = 0; 4664 unsigned long nr = 0;
4317 u64 objectid; 4665 u64 objectid;
4318 u64 index = 0; 4666 u64 index = 0;
4319 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4320 /* 4671 /*
4321 * 2 for inode item and ref 4672 * 2 for inode item and ref
4322 * 2 for dir items 4673 * 2 for dir items
4323 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4324 */ 4675 */
4325 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4326 if (err) 4677 if (IS_ERR(trans))
4327 return err; 4678 return PTR_ERR(trans);
4328 4679
4329 trans = btrfs_start_transaction(root, 1);
4330 if (!trans)
4331 goto fail;
4332 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4333 4681
4334 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4335 if (err) {
4336 err = -ENOSPC;
4337 goto out_unlock;
4338 }
4339
4340 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4341 dentry->d_name.len, 4683 dentry->d_name.len,
4342 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4368out_unlock: 4710out_unlock:
4369 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4370 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4371fail:
4372 btrfs_unreserve_metadata_space(root, 5);
4373 if (drop_inode) { 4713 if (drop_inode) {
4374 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4375 iput(inode); 4715 iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4396 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4397 return -EPERM; 4737 return -EPERM;
4398 4738
4399 /*
4400 * 1 item for inode ref
4401 * 2 items for dir items
4402 */
4403 err = btrfs_reserve_metadata_space(root, 3);
4404 if (err)
4405 return err;
4406
4407 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4408 4740
4409 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4410 if (err) 4742 if (err)
4411 goto fail; 4743 goto fail;
4412 4744
4413 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4414 4754
4415 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4416 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4429 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4430 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4431fail: 4771fail:
4432 btrfs_unreserve_metadata_space(root, 3);
4433 if (drop_inode) { 4772 if (drop_inode) {
4434 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4435 iput(inode); 4774 iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4449 u64 index = 0; 4788 u64 index = 0;
4450 unsigned long nr = 1; 4789 unsigned long nr = 1;
4451 4790
4791 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4792 if (err)
4793 return err;
4794
4452 /* 4795 /*
4453 * 2 items for inode and ref 4796 * 2 items for inode and ref
4454 * 2 items for dir items 4797 * 2 items for dir items
4455 * 1 for xattr if selinux is on 4798 * 1 for xattr if selinux is on
4456 */ 4799 */
4457 err = btrfs_reserve_metadata_space(root, 5); 4800 trans = btrfs_start_transaction(root, 5);
4458 if (err) 4801 if (IS_ERR(trans))
4459 return err; 4802 return PTR_ERR(trans);
4460
4461 trans = btrfs_start_transaction(root, 1);
4462 if (!trans) {
4463 err = -ENOMEM;
4464 goto out_unlock;
4465 }
4466 btrfs_set_trans_block_group(trans, dir); 4803 btrfs_set_trans_block_group(trans, dir);
4467 4804
4468 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4469 if (err) {
4470 err = -ENOSPC;
4471 goto out_fail;
4472 }
4473
4474 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4475 dentry->d_name.len, 4806 dentry->d_name.len,
4476 dentry->d_parent->d_inode->i_ino, objectid, 4807 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4510out_fail: 4841out_fail:
4511 nr = trans->blocks_used; 4842 nr = trans->blocks_used;
4512 btrfs_end_transaction_throttle(trans, root); 4843 btrfs_end_transaction_throttle(trans, root);
4513
4514out_unlock:
4515 btrfs_unreserve_metadata_space(root, 5);
4516 if (drop_on_err) 4844 if (drop_on_err)
4517 iput(inode); 4845 iput(inode);
4518 btrfs_btree_balance_dirty(root, nr); 4846 btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
4770 } 5098 }
4771 flush_dcache_page(page); 5099 flush_dcache_page(page);
4772 } else if (create && PageUptodate(page)) { 5100 } else if (create && PageUptodate(page)) {
5101 WARN_ON(1);
4773 if (!trans) { 5102 if (!trans) {
4774 kunmap(page); 5103 kunmap(page);
4775 free_extent_map(em); 5104 free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
4866 return em; 5195 return em;
4867} 5196}
4868 5197
5198static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5199 u64 start, u64 len)
5200{
5201 struct btrfs_root *root = BTRFS_I(inode)->root;
5202 struct btrfs_trans_handle *trans;
5203 struct extent_map *em;
5204 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5205 struct btrfs_key ins;
5206 u64 alloc_hint;
5207 int ret;
5208
5209 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5210
5211 trans = btrfs_join_transaction(root, 0);
5212 if (!trans)
5213 return ERR_PTR(-ENOMEM);
5214
5215 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5216
5217 alloc_hint = get_extent_allocation_hint(inode, start, len);
5218 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5219 alloc_hint, (u64)-1, &ins, 1);
5220 if (ret) {
5221 em = ERR_PTR(ret);
5222 goto out;
5223 }
5224
5225 em = alloc_extent_map(GFP_NOFS);
5226 if (!em) {
5227 em = ERR_PTR(-ENOMEM);
5228 goto out;
5229 }
5230
5231 em->start = start;
5232 em->orig_start = em->start;
5233 em->len = ins.offset;
5234
5235 em->block_start = ins.objectid;
5236 em->block_len = ins.offset;
5237 em->bdev = root->fs_info->fs_devices->latest_bdev;
5238 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5239
5240 while (1) {
5241 write_lock(&em_tree->lock);
5242 ret = add_extent_mapping(em_tree, em);
5243 write_unlock(&em_tree->lock);
5244 if (ret != -EEXIST)
5245 break;
5246 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5247 }
5248
5249 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5250 ins.offset, ins.offset, 0);
5251 if (ret) {
5252 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5253 em = ERR_PTR(ret);
5254 }
5255out:
5256 btrfs_end_transaction(trans, root);
5257 return em;
5258}
5259
5260/*
 5261 * returns 1 when the nocow is safe, < 0 on error, 0 if the
5262 * block must be cow'd
5263 */
5264static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5265 struct inode *inode, u64 offset, u64 len)
5266{
5267 struct btrfs_path *path;
5268 int ret;
5269 struct extent_buffer *leaf;
5270 struct btrfs_root *root = BTRFS_I(inode)->root;
5271 struct btrfs_file_extent_item *fi;
5272 struct btrfs_key key;
5273 u64 disk_bytenr;
5274 u64 backref_offset;
5275 u64 extent_end;
5276 u64 num_bytes;
5277 int slot;
5278 int found_type;
5279
5280 path = btrfs_alloc_path();
5281 if (!path)
5282 return -ENOMEM;
5283
5284 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5285 offset, 0);
5286 if (ret < 0)
5287 goto out;
5288
5289 slot = path->slots[0];
5290 if (ret == 1) {
5291 if (slot == 0) {
5292 /* can't find the item, must cow */
5293 ret = 0;
5294 goto out;
5295 }
5296 slot--;
5297 }
5298 ret = 0;
5299 leaf = path->nodes[0];
5300 btrfs_item_key_to_cpu(leaf, &key, slot);
5301 if (key.objectid != inode->i_ino ||
5302 key.type != BTRFS_EXTENT_DATA_KEY) {
5303 /* not our file or wrong item type, must cow */
5304 goto out;
5305 }
5306
5307 if (key.offset > offset) {
5308 /* Wrong offset, must cow */
5309 goto out;
5310 }
5311
5312 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5313 found_type = btrfs_file_extent_type(leaf, fi);
5314 if (found_type != BTRFS_FILE_EXTENT_REG &&
5315 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5316 /* not a regular extent, must cow */
5317 goto out;
5318 }
5319 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5320 backref_offset = btrfs_file_extent_offset(leaf, fi);
5321
5322 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5323 if (extent_end < offset + len) {
5324 /* extent doesn't include our full range, must cow */
5325 goto out;
5326 }
5327
5328 if (btrfs_extent_readonly(root, disk_bytenr))
5329 goto out;
5330
5331 /*
 5332 * look for other files referencing this extent; if we
 5333 * find any, we must cow
5334 */
5335 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5336 key.offset - backref_offset, disk_bytenr))
5337 goto out;
5338
5339 /*
5340 * adjust disk_bytenr and num_bytes to cover just the bytes
5341 * in this extent we are about to write. If there
5342 * are any csums in that range we have to cow in order
5343 * to keep the csums correct
5344 */
5345 disk_bytenr += backref_offset;
5346 disk_bytenr += offset - key.offset;
5347 num_bytes = min(offset + len, extent_end) - offset;
5348 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5349 goto out;
5350 /*
5351 * all of the above have passed, it is safe to overwrite this extent
5352 * without cow
5353 */
5354 ret = 1;
5355out:
5356 btrfs_free_path(path);
5357 return ret;
5358}
5359
5360static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5361 struct buffer_head *bh_result, int create)
5362{
5363 struct extent_map *em;
5364 struct btrfs_root *root = BTRFS_I(inode)->root;
5365 u64 start = iblock << inode->i_blkbits;
5366 u64 len = bh_result->b_size;
5367 struct btrfs_trans_handle *trans;
5368
5369 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5370 if (IS_ERR(em))
5371 return PTR_ERR(em);
5372
5373 /*
 5374 * Ok, for INLINE and COMPRESSED extents we need to fall back on buffered
 5375 * io. INLINE is special, and we could probably kludge it in here, but
 5376 * it's still buffered, so for safety let's just fall back to the generic
5377 * buffered path.
5378 *
5379 * For COMPRESSED we _have_ to read the entire extent in so we can
5380 * decompress it, so there will be buffering required no matter what we
 5381 * do, so go ahead and fall back to buffered.
5382 *
 5383 * We return -ENOTBLK because that's what makes DIO go ahead and go back
 5384 * to buffered IO. Don't blame me; this is the price we pay for using
5385 * the generic code.
5386 */
5387 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5388 em->block_start == EXTENT_MAP_INLINE) {
5389 free_extent_map(em);
5390 return -ENOTBLK;
5391 }
5392
5393 /* Just a good old fashioned hole, return */
5394 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5395 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5396 free_extent_map(em);
5397 /* DIO will do one hole at a time, so just unlock a sector */
5398 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5399 start + root->sectorsize - 1, GFP_NOFS);
5400 return 0;
5401 }
5402
5403 /*
5404 * We don't allocate a new extent in the following cases
5405 *
5406 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5407 * existing extent.
5408 * 2) The extent is marked as PREALLOC. We're good to go here and can
5409 * just use the extent.
5410 *
5411 */
5412 if (!create) {
5413 len = em->len - (start - em->start);
5414 goto map;
5415 }
5416
5417 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5418 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5419 em->block_start != EXTENT_MAP_HOLE)) {
5420 int type;
5421 int ret;
5422 u64 block_start;
5423
5424 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5425 type = BTRFS_ORDERED_PREALLOC;
5426 else
5427 type = BTRFS_ORDERED_NOCOW;
5428 len = min(len, em->len - (start - em->start));
5429 block_start = em->block_start + (start - em->start);
5430
5431 /*
5432 * we're not going to log anything, but we do need
5433 * to make sure the current transaction stays open
5434 * while we look for nocow cross refs
5435 */
5436 trans = btrfs_join_transaction(root, 0);
5437 if (!trans)
5438 goto must_cow;
5439
5440 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5441 ret = btrfs_add_ordered_extent_dio(inode, start,
5442 block_start, len, len, type);
5443 btrfs_end_transaction(trans, root);
5444 if (ret) {
5445 free_extent_map(em);
5446 return ret;
5447 }
5448 goto unlock;
5449 }
5450 btrfs_end_transaction(trans, root);
5451 }
5452must_cow:
5453 /*
5454 * this will cow the extent, reset the len in case we changed
5455 * it above
5456 */
5457 len = bh_result->b_size;
5458 free_extent_map(em);
5459 em = btrfs_new_extent_direct(inode, start, len);
5460 if (IS_ERR(em))
5461 return PTR_ERR(em);
5462 len = min(len, em->len - (start - em->start));
5463unlock:
5464 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5465 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5466 0, NULL, GFP_NOFS);
5467map:
5468 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5469 inode->i_blkbits;
5470 bh_result->b_size = len;
5471 bh_result->b_bdev = em->bdev;
5472 set_buffer_mapped(bh_result);
5473 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5474 set_buffer_new(bh_result);
5475
5476 free_extent_map(em);
5477
5478 return 0;
5479}
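The buffer_head mapping at the end of btrfs_get_blocks_direct() is plain shift arithmetic; with illustrative 4 KiB blocks (i_blkbits == 12), a request for iblock 10 against an extent that starts at file offset 32 KiB and disk byte 1 MiB resolves as:

	/*
	 * start                = iblock << i_blkbits            = 10 << 12 = 40960
	 * start - em->start    = 40960 - 32768                  = 8192
	 * bh_result->b_blocknr = (em->block_start + 8192) >> 12 = (1048576 + 8192) >> 12 = 258
	 * bh_result->b_size    = len (possibly trimmed to em->len - (start - em->start))
	 */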
5480
5481struct btrfs_dio_private {
5482 struct inode *inode;
5483 u64 logical_offset;
5484 u64 disk_bytenr;
5485 u64 bytes;
5486 u32 *csums;
5487 void *private;
5488};
5489
5490static void btrfs_endio_direct_read(struct bio *bio, int err)
5491{
5492 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5493 struct bio_vec *bvec = bio->bi_io_vec;
5494 struct btrfs_dio_private *dip = bio->bi_private;
5495 struct inode *inode = dip->inode;
5496 struct btrfs_root *root = BTRFS_I(inode)->root;
5497 u64 start;
5498 u32 *private = dip->csums;
5499
5500 start = dip->logical_offset;
5501 do {
5502 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5503 struct page *page = bvec->bv_page;
5504 char *kaddr;
5505 u32 csum = ~(u32)0;
5506 unsigned long flags;
5507
5508 local_irq_save(flags);
5509 kaddr = kmap_atomic(page, KM_IRQ0);
5510 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5511 csum, bvec->bv_len);
5512 btrfs_csum_final(csum, (char *)&csum);
5513 kunmap_atomic(kaddr, KM_IRQ0);
5514 local_irq_restore(flags);
5515
5516 flush_dcache_page(bvec->bv_page);
5517 if (csum != *private) {
5518 printk(KERN_ERR "btrfs csum failed ino %lu off"
5519 " %llu csum %u private %u\n",
5520 inode->i_ino, (unsigned long long)start,
5521 csum, *private);
5522 err = -EIO;
5523 }
5524 }
5525
5526 start += bvec->bv_len;
5527 private++;
5528 bvec++;
5529 } while (bvec <= bvec_end);
5530
5531 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5532 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5533 bio->bi_private = dip->private;
5534
5535 kfree(dip->csums);
5536 kfree(dip);
5537 dio_end_io(bio, err);
5538}
5539
5540static void btrfs_endio_direct_write(struct bio *bio, int err)
5541{
5542 struct btrfs_dio_private *dip = bio->bi_private;
5543 struct inode *inode = dip->inode;
5544 struct btrfs_root *root = BTRFS_I(inode)->root;
5545 struct btrfs_trans_handle *trans;
5546 struct btrfs_ordered_extent *ordered = NULL;
5547 struct extent_state *cached_state = NULL;
5548 int ret;
5549
5550 if (err)
5551 goto out_done;
5552
5553 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5554 dip->logical_offset, dip->bytes);
5555 if (!ret)
5556 goto out_done;
5557
5558 BUG_ON(!ordered);
5559
5560 trans = btrfs_join_transaction(root, 1);
5561 if (!trans) {
5562 err = -ENOMEM;
5563 goto out;
5564 }
5565 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5566
5567 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5568 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5569 if (!ret)
5570 ret = btrfs_update_inode(trans, root, inode);
5571 err = ret;
5572 goto out;
5573 }
5574
5575 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5576 ordered->file_offset + ordered->len - 1, 0,
5577 &cached_state, GFP_NOFS);
5578
5579 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5580 ret = btrfs_mark_extent_written(trans, inode,
5581 ordered->file_offset,
5582 ordered->file_offset +
5583 ordered->len);
5584 if (ret) {
5585 err = ret;
5586 goto out_unlock;
5587 }
5588 } else {
5589 ret = insert_reserved_file_extent(trans, inode,
5590 ordered->file_offset,
5591 ordered->start,
5592 ordered->disk_len,
5593 ordered->len,
5594 ordered->len,
5595 0, 0, 0,
5596 BTRFS_FILE_EXTENT_REG);
5597 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5598 ordered->file_offset, ordered->len);
5599 if (ret) {
5600 err = ret;
5601 WARN_ON(1);
5602 goto out_unlock;
5603 }
5604 }
5605
5606 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5607 btrfs_ordered_update_i_size(inode, 0, ordered);
5608 btrfs_update_inode(trans, root, inode);
5609out_unlock:
5610 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5611 ordered->file_offset + ordered->len - 1,
5612 &cached_state, GFP_NOFS);
5613out:
5614 btrfs_delalloc_release_metadata(inode, ordered->len);
5615 btrfs_end_transaction(trans, root);
5616 btrfs_put_ordered_extent(ordered);
5617 btrfs_put_ordered_extent(ordered);
5618out_done:
5619 bio->bi_private = dip->private;
5620
5621 kfree(dip->csums);
5622 kfree(dip);
5623 dio_end_io(bio, err);
5624}
5625
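The write-completion path above only finalizes the ordered extent once btrfs_dec_test_ordered_pending() reports that the whole range has completed: each finishing I/O subtracts its byte count, and only the caller that drives the remaining count to zero performs the metadata updates. A hedged user-space sketch of that decrement-and-test idea; C11 atomics and invented names stand in for the kernel's tree lock and struct fields:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct ordered_extent {
        _Atomic unsigned long bytes_left;
    };

    /* Returns true only for the completion that finishes the extent. */
    static bool dec_test_ordered_pending(struct ordered_extent *oe,
                                         unsigned long io_size)
    {
        unsigned long prev = atomic_fetch_sub(&oe->bytes_left, io_size);
        return prev == io_size;
    }

    int main(void)
    {
        struct ordered_extent oe = { .bytes_left = 8192 };

        printf("first 4k done, finish now? %d\n",
               dec_test_ordered_pending(&oe, 4096));   /* 0: still pending */
        printf("second 4k done, finish now? %d\n",
               dec_test_ordered_pending(&oe, 4096));   /* 1: run the ordered work */
        return 0;
    }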
5626static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5627 struct bio *bio, int mirror_num,
5628 unsigned long bio_flags, u64 offset)
5629{
5630 int ret;
5631 struct btrfs_root *root = BTRFS_I(inode)->root;
5632 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5633 BUG_ON(ret);
5634 return 0;
5635}
5636
5637static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5638 loff_t file_offset)
5639{
5640 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum;
5645 int write = rw & (1 << BIO_RW);
5646 int ret = 0;
5647
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5649
5650 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5651 if (!dip) {
5652 ret = -ENOMEM;
5653 goto free_ordered;
5654 }
5655 dip->csums = NULL;
5656
5657 if (!skip_sum) {
5658 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5659 if (!dip->csums) {
5660 ret = -ENOMEM;
5661 goto free_ordered;
5662 }
5663 }
5664
5665 dip->private = bio->bi_private;
5666 dip->inode = inode;
5667 dip->logical_offset = file_offset;
5668
5669 start = dip->logical_offset;
5670 dip->bytes = 0;
5671 do {
5672 dip->bytes += bvec->bv_len;
5673 bvec++;
5674 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5675
5676 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5677 bio->bi_private = dip;
5678
5679 if (write)
5680 bio->bi_end_io = btrfs_endio_direct_write;
5681 else
5682 bio->bi_end_io = btrfs_endio_direct_read;
5683
5684 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5685 if (ret)
5686 goto out_err;
5687
5688 if (write && !skip_sum) {
5689 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5690 inode, rw, bio, 0, 0,
5691 dip->logical_offset,
5692 __btrfs_submit_bio_start_direct_io,
5693 __btrfs_submit_bio_done);
5694 if (ret)
5695 goto out_err;
5696 return;
5697 } else if (!skip_sum)
5698 btrfs_lookup_bio_sums_dio(root, inode, bio,
5699 dip->logical_offset, dip->csums);
5700
5701 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5702 if (ret)
5703 goto out_err;
5704 return;
5705out_err:
5706 kfree(dip->csums);
5707 kfree(dip);
5708free_ordered:
5709 /*
5710 * If this is a write, we need to clean up the reserved space and kill
5711 * the ordered extent.
5712 */
5713 if (write) {
5714 struct btrfs_ordered_extent *ordered;
5715 ordered = btrfs_lookup_ordered_extent(inode,
5716 dip->logical_offset);
5717 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5718 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5719 btrfs_free_reserved_extent(root, ordered->start,
5720 ordered->disk_len);
5721 btrfs_put_ordered_extent(ordered);
5722 btrfs_put_ordered_extent(ordered);
5723 }
5724 bio_endio(bio, ret);
5725}
5726
5727static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5728 const struct iovec *iov, loff_t offset,
5729 unsigned long nr_segs)
5730{
5731 int seg;
5732 size_t size;
5733 unsigned long addr;
5734 unsigned blocksize_mask = root->sectorsize - 1;
5735 ssize_t retval = -EINVAL;
5736 loff_t end = offset;
5737
5738 if (offset & blocksize_mask)
5739 goto out;
5740
5741 /* Check the memory alignment. Blocks cannot straddle pages */
5742 for (seg = 0; seg < nr_segs; seg++) {
5743 addr = (unsigned long)iov[seg].iov_base;
5744 size = iov[seg].iov_len;
5745 end += size;
5746 if ((addr & blocksize_mask) || (size & blocksize_mask))
5747 goto out;
5748 }
5749 retval = 0;
5750out:
5751 return retval;
5752}
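check_direct_IO() above rejects the request when the file offset or any iovec base/length is not sector aligned. The mask trick works because the sector size is a power of two, so (x & (sectorsize - 1)) equals x % sectorsize. A stand-alone illustration, assuming a 4096-byte sector size:

    #include <stdio.h>

    int main(void)
    {
        unsigned long sectorsize = 4096;
        unsigned long mask = sectorsize - 1;
        unsigned long offsets[] = { 0, 512, 4096, 8192, 12345 };

        for (unsigned i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
            printf("offset %5lu -> %s\n", offsets[i],
                   (offsets[i] & mask) ? "misaligned, fall back to buffered" : "ok for DIO");
        return 0;
    }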
4869static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5753static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4870 const struct iovec *iov, loff_t offset, 5754 const struct iovec *iov, loff_t offset,
4871 unsigned long nr_segs) 5755 unsigned long nr_segs)
4872{ 5756{
4873 return -EINVAL; 5757 struct file *file = iocb->ki_filp;
5758 struct inode *inode = file->f_mapping->host;
5759 struct btrfs_ordered_extent *ordered;
5760 struct extent_state *cached_state = NULL;
5761 u64 lockstart, lockend;
5762 ssize_t ret;
5763 int writing = rw & WRITE;
5764 int write_bits = 0;
5765 size_t count = iov_length(iov, nr_segs);
5766
5767 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5768 offset, nr_segs)) {
5769 return 0;
5770 }
5771
5772 lockstart = offset;
5773 lockend = offset + count - 1;
5774
5775 if (writing) {
5776 ret = btrfs_delalloc_reserve_space(inode, count);
5777 if (ret)
5778 goto out;
5779 }
5780
5781 while (1) {
5782 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5783 0, &cached_state, GFP_NOFS);
5784 /*
5785 * We're concerned with the entire range that we're going to be
5786 * doing DIO to, so we need to make sure there are no ordered
5787 * extents in this range.
5788 */
5789 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5790 lockend - lockstart + 1);
5791 if (!ordered)
5792 break;
5793 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5794 &cached_state, GFP_NOFS);
5795 btrfs_start_ordered_extent(inode, ordered, 1);
5796 btrfs_put_ordered_extent(ordered);
5797 cond_resched();
5798 }
5799
5800 /*
5801 * we don't use btrfs_set_extent_delalloc because we don't want
5802 * the dirty or uptodate bits
5803 */
5804 if (writing) {
5805 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5806 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5807 EXTENT_DELALLOC, 0, NULL, &cached_state,
5808 GFP_NOFS);
5809 if (ret) {
5810 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5811 lockend, EXTENT_LOCKED | write_bits,
5812 1, 0, &cached_state, GFP_NOFS);
5813 goto out;
5814 }
5815 }
5816
5817 free_extent_state(cached_state);
5818 cached_state = NULL;
5819
5820 ret = __blockdev_direct_IO(rw, iocb, inode,
5821 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5822 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5823 btrfs_submit_direct, 0);
5824
5825 if (ret < 0 && ret != -EIOCBQUEUED) {
5826 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5827 offset + iov_length(iov, nr_segs) - 1,
5828 EXTENT_LOCKED | write_bits, 1, 0,
5829 &cached_state, GFP_NOFS);
5830 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5831 /*
5832 * We're falling back to buffered, unlock the section we didn't
5833 * do IO on.
5834 */
5835 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5836 offset + iov_length(iov, nr_segs) - 1,
5837 EXTENT_LOCKED | write_bits, 1, 0,
5838 &cached_state, GFP_NOFS);
5839 }
5840out:
5841 free_extent_state(cached_state);
5842 return ret;
4874} 5843}
4875 5844
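Before issuing the direct I/O, btrfs_direct_IO() above loops: lock the byte range, look for any ordered extent overlapping it, and if one is found unlock, wait for it to finish, and try again. A compile-and-run sketch of that retry shape; every helper below is a stub invented for the example, not a kernel API:

    #include <stdbool.h>
    #include <stdio.h>

    struct range { unsigned long long start, end; };

    static void lock_range(struct range r)   { printf("lock   [%llu, %llu]\n", r.start, r.end); }
    static void unlock_range(struct range r) { printf("unlock [%llu, %llu]\n", r.start, r.end); }

    static int pending_ordered = 1;          /* pretend one ordered extent is in flight */

    static bool lookup_conflicting_ordered(struct range r)
    {
        (void)r;
        return pending_ordered > 0;
    }

    static void wait_for_ordered(void)
    {
        printf("waiting for the ordered extent to complete\n");
        pending_ordered--;
    }

    int main(void)
    {
        struct range r = { 0, 1048575 };     /* a 1 MiB DIO request */

        for (;;) {
            lock_range(r);
            if (!lookup_conflicting_ordered(r))
                break;                       /* range is clean, safe to issue DIO */
            unlock_range(r);
            wait_for_ordered();
        }
        printf("issue direct I/O\n");
        unlock_range(r);
        return 0;
    }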
4876static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5845static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5034 u64 page_start; 6003 u64 page_start;
5035 u64 page_end; 6004 u64 page_end;
5036 6005
5037 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6006 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5038 if (ret) { 6007 if (ret) {
5039 if (ret == -ENOMEM) 6008 if (ret == -ENOMEM)
5040 ret = VM_FAULT_OOM; 6009 ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5043 goto out; 6012 goto out;
5044 } 6013 }
5045 6014
5046 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5047 if (ret) {
5048 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5049 ret = VM_FAULT_SIGBUS;
5050 goto out;
5051 }
5052
5053 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6015 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5054again: 6016again:
5055 lock_page(page); 6017 lock_page(page);
@@ -5059,7 +6021,6 @@ again:
5059 6021
5060 if ((page->mapping != inode->i_mapping) || 6022 if ((page->mapping != inode->i_mapping) ||
5061 (page_start >= size)) { 6023 (page_start >= size)) {
5062 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5063 /* page got truncated out from underneath us */ 6024 /* page got truncated out from underneath us */
5064 goto out_unlock; 6025 goto out_unlock;
5065 } 6026 }
@@ -5100,7 +6061,6 @@ again:
5100 unlock_extent_cached(io_tree, page_start, page_end, 6061 unlock_extent_cached(io_tree, page_start, page_end,
5101 &cached_state, GFP_NOFS); 6062 &cached_state, GFP_NOFS);
5102 ret = VM_FAULT_SIGBUS; 6063 ret = VM_FAULT_SIGBUS;
5103 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5104 goto out_unlock; 6064 goto out_unlock;
5105 } 6065 }
5106 ret = 0; 6066 ret = 0;
@@ -5127,10 +6087,10 @@ again:
5127 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6087 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5128 6088
5129out_unlock: 6089out_unlock:
5130 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5131 if (!ret) 6090 if (!ret)
5132 return VM_FAULT_LOCKED; 6091 return VM_FAULT_LOCKED;
5133 unlock_page(page); 6092 unlock_page(page);
6093 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5134out: 6094out:
5135 return ret; 6095 return ret;
5136} 6096}
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
5155 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6115 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5156 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6116 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5157 6117
5158 trans = btrfs_start_transaction(root, 1); 6118 trans = btrfs_start_transaction(root, 0);
6119 BUG_ON(IS_ERR(trans));
5159 btrfs_set_trans_block_group(trans, inode); 6120 btrfs_set_trans_block_group(trans, inode);
6121 trans->block_rsv = root->orphan_block_rsv;
5160 6122
5161 /* 6123 /*
5162 * setattr is responsible for setting the ordered_data_close flag, 6124 * setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
5179 btrfs_add_ordered_operation(trans, root, inode); 6141 btrfs_add_ordered_operation(trans, root, inode);
5180 6142
5181 while (1) { 6143 while (1) {
6144 if (!trans) {
6145 trans = btrfs_start_transaction(root, 0);
6146 BUG_ON(IS_ERR(trans));
6147 btrfs_set_trans_block_group(trans, inode);
6148 trans->block_rsv = root->orphan_block_rsv;
6149 }
6150
6151 ret = btrfs_block_rsv_check(trans, root,
6152 root->orphan_block_rsv, 0, 5);
6153 if (ret) {
6154 BUG_ON(ret != -EAGAIN);
6155 ret = btrfs_commit_transaction(trans, root);
6156 BUG_ON(ret);
6157 trans = NULL;
6158 continue;
6159 }
6160
5182 ret = btrfs_truncate_inode_items(trans, root, inode, 6161 ret = btrfs_truncate_inode_items(trans, root, inode,
5183 inode->i_size, 6162 inode->i_size,
5184 BTRFS_EXTENT_DATA_KEY); 6163 BTRFS_EXTENT_DATA_KEY);
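The truncate loop above now restarts its transaction whenever the orphan block reservation runs low: btrfs_block_rsv_check() returning -EAGAIN means commit what has been done so far, then start a fresh transaction and continue. A small stand-alone sketch of that commit-and-retry control flow, with stub functions in place of the kernel calls:

    #include <stdio.h>

    static int budget = 0;                  /* pretend the reservation starts empty */

    /* 0 when enough space is reserved, -1 standing in for -EAGAIN */
    static int check_reservation(void)
    {
        return budget-- > 0 ? 0 : -1;
    }

    int main(void)
    {
        int started = 0;

        while (1) {
            if (!started) {
                printf("start transaction\n");
                started = 1;
            }
            if (check_reservation()) {
                printf("reservation low: commit, then retry\n");
                started = 0;
                budget = 1;                 /* the commit replenished the reservation */
                continue;
            }
            printf("truncate one batch of items\n");
            break;
        }
        printf("truncate finished\n");
        return 0;
    }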
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
5190 6169
5191 nr = trans->blocks_used; 6170 nr = trans->blocks_used;
5192 btrfs_end_transaction(trans, root); 6171 btrfs_end_transaction(trans, root);
6172 trans = NULL;
5193 btrfs_btree_balance_dirty(root, nr); 6173 btrfs_btree_balance_dirty(root, nr);
5194
5195 trans = btrfs_start_transaction(root, 1);
5196 btrfs_set_trans_block_group(trans, inode);
5197 } 6174 }
5198 6175
5199 if (ret == 0 && inode->i_nlink > 0) { 6176 if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5254struct inode *btrfs_alloc_inode(struct super_block *sb) 6231struct inode *btrfs_alloc_inode(struct super_block *sb)
5255{ 6232{
5256 struct btrfs_inode *ei; 6233 struct btrfs_inode *ei;
6234 struct inode *inode;
5257 6235
5258 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6236 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5259 if (!ei) 6237 if (!ei)
5260 return NULL; 6238 return NULL;
6239
6240 ei->root = NULL;
6241 ei->space_info = NULL;
6242 ei->generation = 0;
6243 ei->sequence = 0;
5261 ei->last_trans = 0; 6244 ei->last_trans = 0;
5262 ei->last_sub_trans = 0; 6245 ei->last_sub_trans = 0;
5263 ei->logged_trans = 0; 6246 ei->logged_trans = 0;
5264 ei->outstanding_extents = 0; 6247 ei->delalloc_bytes = 0;
5265 ei->reserved_extents = 0; 6248 ei->reserved_bytes = 0;
5266 ei->root = NULL; 6249 ei->disk_i_size = 0;
6250 ei->flags = 0;
6251 ei->index_cnt = (u64)-1;
6252 ei->last_unlink_trans = 0;
6253
5267 spin_lock_init(&ei->accounting_lock); 6254 spin_lock_init(&ei->accounting_lock);
6255 atomic_set(&ei->outstanding_extents, 0);
6256 ei->reserved_extents = 0;
6257
6258 ei->ordered_data_close = 0;
6259 ei->orphan_meta_reserved = 0;
6260 ei->dummy_inode = 0;
6261 ei->force_compress = 0;
6262
6263 inode = &ei->vfs_inode;
6264 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6265 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6266 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6267 mutex_init(&ei->log_mutex);
5268 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6268 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5269 INIT_LIST_HEAD(&ei->i_orphan); 6269 INIT_LIST_HEAD(&ei->i_orphan);
6270 INIT_LIST_HEAD(&ei->delalloc_inodes);
5270 INIT_LIST_HEAD(&ei->ordered_operations); 6271 INIT_LIST_HEAD(&ei->ordered_operations);
5271 return &ei->vfs_inode; 6272 RB_CLEAR_NODE(&ei->rb_node);
6273
6274 return inode;
5272} 6275}
5273 6276
5274void btrfs_destroy_inode(struct inode *inode) 6277void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
5278 6281
5279 WARN_ON(!list_empty(&inode->i_dentry)); 6282 WARN_ON(!list_empty(&inode->i_dentry));
5280 WARN_ON(inode->i_data.nrpages); 6283 WARN_ON(inode->i_data.nrpages);
6284 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6285 WARN_ON(BTRFS_I(inode)->reserved_extents);
5281 6286
5282 /* 6287 /*
5283 * This can happen where we create an inode, but somebody else also 6288 * This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
5298 spin_unlock(&root->fs_info->ordered_extent_lock); 6303 spin_unlock(&root->fs_info->ordered_extent_lock);
5299 } 6304 }
5300 6305
5301 spin_lock(&root->list_lock); 6306 spin_lock(&root->orphan_lock);
5302 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5303 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5304 inode->i_ino); 6309 inode->i_ino);
5305 list_del_init(&BTRFS_I(inode)->i_orphan); 6310 list_del_init(&BTRFS_I(inode)->i_orphan);
5306 } 6311 }
5307 spin_unlock(&root->list_lock); 6312 spin_unlock(&root->orphan_lock);
5308 6313
5309 while (1) { 6314 while (1) {
5310 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6315 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5425 if (S_ISDIR(old_inode->i_mode) && new_inode && 6430 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5426 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6431 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5427 return -ENOTEMPTY; 6432 return -ENOTEMPTY;
5428
5429 /*
5430 * We want to reserve the absolute worst case amount of items. So if
5431 * both inodes are subvols and we need to unlink them then that would
5432 * require 4 item modifications, but if they are both normal inodes it
5433 * would require 5 item modifications, so we'll assume their normal
5434 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5435 * should cover the worst case number of items we'll modify.
5436 */
5437 ret = btrfs_reserve_metadata_space(root, 11);
5438 if (ret)
5439 return ret;
5440
5441 /* 6433 /*
5442 * we're using rename to replace one file with another. 6434 * we're using rename to replace one file with another.
5443 * and the replacement file is large. Start IO on it now so 6435 * and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5450 /* close the racy window with snapshot create/destroy ioctl */ 6442 /* close the racy window with snapshot create/destroy ioctl */
5451 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6443 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5452 down_read(&root->fs_info->subvol_sem); 6444 down_read(&root->fs_info->subvol_sem);
6445 /*
6446 * We want to reserve the absolute worst case amount of items. So if
6447 * both inodes are subvols and we need to unlink them then that would
6448 * require 4 item modifications, but if they are both normal inodes it
6449 * would require 5 item modifications, so we'll assume they're normal
6450 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6451 * should cover the worst case number of items we'll modify.
6452 */
6453 trans = btrfs_start_transaction(root, 20);
6454 if (IS_ERR(trans))
6455 return PTR_ERR(trans);
5453 6456
5454 trans = btrfs_start_transaction(root, 1);
5455 btrfs_set_trans_block_group(trans, new_dir); 6457 btrfs_set_trans_block_group(trans, new_dir);
5456 6458
5457 if (dest != root) 6459 if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
5550 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6552 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5551 up_read(&root->fs_info->subvol_sem); 6553 up_read(&root->fs_info->subvol_sem);
5552 6554
5553 btrfs_unreserve_metadata_space(root, 11);
5554 return ret; 6555 return ret;
5555} 6556}
5556 6557
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5602 return 0; 6603 return 0;
5603} 6604}
5604 6605
6606int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6607{
6608 struct btrfs_inode *binode;
6609 struct inode *inode = NULL;
6610
6611 spin_lock(&root->fs_info->delalloc_lock);
6612 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6613 binode = list_entry(root->fs_info->delalloc_inodes.next,
6614 struct btrfs_inode, delalloc_inodes);
6615 inode = igrab(&binode->vfs_inode);
6616 if (inode) {
6617 list_move_tail(&binode->delalloc_inodes,
6618 &root->fs_info->delalloc_inodes);
6619 break;
6620 }
6621
6622 list_del_init(&binode->delalloc_inodes);
6623 cond_resched_lock(&root->fs_info->delalloc_lock);
6624 }
6625 spin_unlock(&root->fs_info->delalloc_lock);
6626
6627 if (inode) {
6628 write_inode_now(inode, 0);
6629 if (delay_iput)
6630 btrfs_add_delayed_iput(inode);
6631 else
6632 iput(inode);
6633 return 1;
6634 }
6635 return 0;
6636}
6637
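btrfs_start_one_delalloc_inode() above shows a common pattern: under the delalloc spinlock, pick one inode, pin it with igrab() and rotate it to the list tail, then drop the lock before doing the expensive writeback. The user-space sketch below keeps only the locking shape: it detaches one entry under a mutex and processes it outside the lock; list rotation and reference counting are omitted, and all names are invented:

    #include <pthread.h>
    #include <stdio.h>

    struct item {
        struct item *next;
        int id;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *head;

    static struct item *grab_one(void)
    {
        struct item *it;

        pthread_mutex_lock(&list_lock);
        it = head;
        if (it)
            head = it->next;                /* detach under the lock */
        pthread_mutex_unlock(&list_lock);
        return it;                          /* the heavy work happens without the lock */
    }

    int main(void)
    {
        struct item a = { .id = 1 }, b = { .next = &a, .id = 2 };

        head = &b;
        for (struct item *it; (it = grab_one()); )
            printf("flushing delalloc for inode %d\n", it->id);
        return 0;
    }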
5605static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6638static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5606 const char *symname) 6639 const char *symname)
5607{ 6640{
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5625 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6658 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5626 return -ENAMETOOLONG; 6659 return -ENAMETOOLONG;
5627 6660
6661 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6662 if (err)
6663 return err;
5628 /* 6664 /*
5629 * 2 items for inode item and ref 6665 * 2 items for inode item and ref
5630 * 2 items for dir items 6666 * 2 items for dir items
5631 * 1 item for xattr if selinux is on 6667 * 1 item for xattr if selinux is on
5632 */ 6668 */
5633 err = btrfs_reserve_metadata_space(root, 5); 6669 trans = btrfs_start_transaction(root, 5);
5634 if (err) 6670 if (IS_ERR(trans))
5635 return err; 6671 return PTR_ERR(trans);
5636 6672
5637 trans = btrfs_start_transaction(root, 1);
5638 if (!trans)
5639 goto out_fail;
5640 btrfs_set_trans_block_group(trans, dir); 6673 btrfs_set_trans_block_group(trans, dir);
5641 6674
5642 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5643 if (err) {
5644 err = -ENOSPC;
5645 goto out_unlock;
5646 }
5647
5648 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6675 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5649 dentry->d_name.len, 6676 dentry->d_name.len,
5650 dentry->d_parent->d_inode->i_ino, objectid, 6677 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5716out_unlock: 6743out_unlock:
5717 nr = trans->blocks_used; 6744 nr = trans->blocks_used;
5718 btrfs_end_transaction_throttle(trans, root); 6745 btrfs_end_transaction_throttle(trans, root);
5719out_fail:
5720 btrfs_unreserve_metadata_space(root, 5);
5721 if (drop_inode) { 6746 if (drop_inode) {
5722 inode_dec_link_count(inode); 6747 inode_dec_link_count(inode);
5723 iput(inode); 6748 iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
5726 return err; 6751 return err;
5727} 6752}
5728 6753
5729static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6754int btrfs_prealloc_file_range(struct inode *inode, int mode,
5730 u64 alloc_hint, int mode, loff_t actual_len) 6755 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint)
5731{ 6757{
5732 struct btrfs_trans_handle *trans; 6758 struct btrfs_trans_handle *trans;
5733 struct btrfs_root *root = BTRFS_I(inode)->root; 6759 struct btrfs_root *root = BTRFS_I(inode)->root;
5734 struct btrfs_key ins; 6760 struct btrfs_key ins;
5735 u64 cur_offset = start; 6761 u64 cur_offset = start;
5736 u64 num_bytes = end - start;
5737 int ret = 0; 6762 int ret = 0;
5738 u64 i_size;
5739 6763
5740 while (num_bytes > 0) { 6764 while (num_bytes > 0) {
5741 trans = btrfs_start_transaction(root, 1); 6765 trans = btrfs_start_transaction(root, 3);
5742 6766 if (IS_ERR(trans)) {
5743 ret = btrfs_reserve_extent(trans, root, num_bytes, 6767 ret = PTR_ERR(trans);
5744 root->sectorsize, 0, alloc_hint, 6768 break;
5745 (u64)-1, &ins, 1);
5746 if (ret) {
5747 WARN_ON(1);
5748 goto stop_trans;
5749 } 6769 }
5750 6770
5751 ret = btrfs_reserve_metadata_space(root, 3); 6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1);
5752 if (ret) { 6773 if (ret) {
5753 btrfs_free_reserved_extent(root, ins.objectid, 6774 btrfs_end_transaction(trans, root);
5754 ins.offset); 6775 break;
5755 goto stop_trans;
5756 } 6776 }
5757 6777
5758 ret = insert_reserved_file_extent(trans, inode, 6778 ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5766 6786
5767 num_bytes -= ins.offset; 6787 num_bytes -= ins.offset;
5768 cur_offset += ins.offset; 6788 cur_offset += ins.offset;
5769 alloc_hint = ins.objectid + ins.offset; 6789 *alloc_hint = ins.objectid + ins.offset;
5770 6790
5771 inode->i_ctime = CURRENT_TIME; 6791 inode->i_ctime = CURRENT_TIME;
5772 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6792 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5773 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5774 (actual_len > inode->i_size) && 6794 (actual_len > inode->i_size) &&
5775 (cur_offset > inode->i_size)) { 6795 (cur_offset > inode->i_size)) {
5776
5777 if (cur_offset > actual_len) 6796 if (cur_offset > actual_len)
5778 i_size = actual_len; 6797 i_size_write(inode, actual_len);
5779 else 6798 else
5780 i_size = cur_offset; 6799 i_size_write(inode, cur_offset);
5781 i_size_write(inode, i_size); 6800 i_size_write(inode, cur_offset);
5782 btrfs_ordered_update_i_size(inode, i_size, NULL); 6801 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5783 } 6802 }
5784 6803
5785 ret = btrfs_update_inode(trans, root, inode); 6804 ret = btrfs_update_inode(trans, root, inode);
5786 BUG_ON(ret); 6805 BUG_ON(ret);
5787 6806
5788 btrfs_end_transaction(trans, root); 6807 btrfs_end_transaction(trans, root);
5789 btrfs_unreserve_metadata_space(root, 3);
5790 } 6808 }
5791 return ret; 6809 return ret;
5792
5793stop_trans:
5794 btrfs_end_transaction(trans, root);
5795 return ret;
5796
5797} 6810}
5798 6811
5799static long btrfs_fallocate(struct inode *inode, int mode, 6812static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5826 goto out; 6839 goto out;
5827 } 6840 }
5828 6841
5829 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6842 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5830 alloc_end - alloc_start);
5831 if (ret) 6843 if (ret)
5832 goto out; 6844 goto out;
5833 6845
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5872 if (em->block_start == EXTENT_MAP_HOLE || 6884 if (em->block_start == EXTENT_MAP_HOLE ||
5873 (cur_offset >= inode->i_size && 6885 (cur_offset >= inode->i_size &&
5874 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6886 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5875 ret = prealloc_file_range(inode, 6887 ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
5876 cur_offset, last_byte, 6888 last_byte - cur_offset,
5877 alloc_hint, mode, offset+len); 6889 1 << inode->i_blkbits,
6890 offset + len,
6891 &alloc_hint);
5878 if (ret < 0) { 6892 if (ret < 0) {
5879 free_extent_map(em); 6893 free_extent_map(em);
5880 break; 6894 break;
5881 } 6895 }
5882 } 6896 }
5883 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5884 alloc_hint = em->block_start;
5885 free_extent_map(em); 6897 free_extent_map(em);
5886 6898
5887 cur_offset = last_byte; 6899 cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5893 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6905 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5894 &cached_state, GFP_NOFS); 6906 &cached_state, GFP_NOFS);
5895 6907
5896 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6908 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5897 alloc_end - alloc_start);
5898out: 6909out:
5899 mutex_unlock(&inode->i_mutex); 6910 mutex_unlock(&inode->i_mutex);
5900 return ret; 6911 return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4cdb98cf26de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 240 u64 index = 0;
241 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
242 /* 246 /*
243 * 1 - inode item 247 * 1 - inode item
244 * 2 - refs 248 * 2 - refs
245 * 1 - root item 249 * 1 - root item
246 * 2 - dir items 250 * 2 - dir items
247 */ 251 */
248 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
249 if (ret) 253 if (IS_ERR(trans))
250 return ret; 254 return PTR_ERR(trans);
251
252 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(!trans);
254
255 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256 0, &objectid);
257 if (ret)
258 goto fail;
259 255
260 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
345 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
346 if (err && !ret) 342 if (err && !ret)
347 ret = err; 343 ret = err;
348
349 btrfs_unreserve_metadata_space(root, 6);
350 return ret; 344 return ret;
351} 345}
352 346
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
354 char *name, int namelen)
355{ 348{
356 struct inode *inode; 349 struct inode *inode;
357 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
361 if (!root->ref_cows) 354 if (!root->ref_cows)
362 return -EINVAL; 355 return -EINVAL;
363 356
364 /*
365 * 1 - inode item
366 * 2 - refs
367 * 1 - root item
368 * 2 - dir items
369 */
370 ret = btrfs_reserve_metadata_space(root, 6);
371 if (ret)
372 goto fail;
373
374 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375 if (!pending_snapshot) { 358 if (!pending_snapshot)
376 ret = -ENOMEM; 359 return -ENOMEM;
377 btrfs_unreserve_metadata_space(root, 6); 360
378 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 }
380 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381 if (!pending_snapshot->name) {
382 ret = -ENOMEM;
383 kfree(pending_snapshot);
384 btrfs_unreserve_metadata_space(root, 6);
385 goto fail;
386 }
387 memcpy(pending_snapshot->name, name, namelen);
388 pending_snapshot->name[namelen] = '\0';
389 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
390 trans = btrfs_start_transaction(root, 1);
391 BUG_ON(!trans);
392 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
393 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
394 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
395 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
396 BUG_ON(ret); 377 BUG_ON(ret);
397 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
398 384
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
405 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
406 ret = 0; 392 ret = 0;
407fail: 393fail:
394 kfree(pending_snapshot);
408 return ret; 395 return ret;
409} 396}
410 397
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 goto out_up_read; 443 goto out_up_read;
457 444
458 if (snap_src) { 445 if (snap_src) {
459 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
460 name, namelen);
461 } else { 447 } else {
462 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
463 name, namelen); 449 name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1; 588 BTRFS_I(inode)->force_compress = 1;
603 589
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
605 if (ret) { 591 if (ret)
606 ret = -ENOSPC; 592 goto err_unlock;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
617again: 593again:
618 if (inode->i_size == 0 || 594 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
622 } 598 }
623 599
624 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
625 if (!page) 601 if (!page) {
602 ret = -ENOMEM;
626 goto err_reservations; 603 goto err_reservations;
604 }
627 605
628 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
629 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
631 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
632 unlock_page(page); 610 unlock_page(page);
633 page_cache_release(page); 611 page_cache_release(page);
612 ret = -EIO;
634 goto err_reservations; 613 goto err_reservations;
635 } 614 }
636 } 615 }
@@ -644,8 +623,7 @@ again:
644 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
645 624
646 if (PageDirty(page)) { 625 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode, 626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
648 PAGE_CACHE_SIZE);
649 goto loop_unlock; 627 goto loop_unlock;
650 } 628 }
651 629
@@ -683,7 +661,6 @@ loop_unlock:
683 page_cache_release(page); 661 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex); 662 mutex_unlock(&inode->i_mutex);
685 663
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++; 665 i++;
689 } 666 }
@@ -713,9 +690,9 @@ loop_unlock:
713 return 0; 690 return 0;
714 691
715err_reservations: 692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
716 mutex_unlock(&inode->i_mutex); 695 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret; 696 return ret;
720} 697}
721 698
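The reworked defrag loop above funnels its error handling through two labels: err_reservations releases the per-page delalloc reservation taken at the top of the iteration, while err_unlock (reached directly when the reservation itself fails) only drops the inode lock. A compact stand-alone illustration of that goto-unwind idiom, with stub helpers in place of the btrfs calls:

    #include <stdio.h>

    static int reserve(void)        { printf("reserve one page of space\n"); return 0; }
    static void release(void)       { printf("release reservation\n"); }
    static void lock_inode(void)    { printf("lock inode\n"); }
    static void unlock_inode(void)  { printf("unlock inode\n"); }

    static int process_one_page(int fail_later)
    {
        int ret;

        lock_inode();
        ret = reserve();
        if (ret)
            goto err_unlock;                /* nothing reserved yet */

        if (fail_later) {                   /* e.g. the page could not be grabbed */
            ret = -1;
            goto err_reservations;
        }

        printf("page dirtied for defrag\n");
        unlock_inode();
        return 0;

    err_reservations:
        release();
    err_unlock:
        unlock_inode();
        return ret;
    }

    int main(void)
    {
        process_one_page(0);
        process_one_page(1);
        return 0;
    }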
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
811 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
812 789
813 if (new_size > old_size) { 790 if (new_size > old_size) {
814 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
815 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
816 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
817 } else { 794 } else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1300 if (err) 1277 if (err)
1301 goto out_up_write; 1278 goto out_up_write;
1302 1279
1303 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
1304 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
1305 dest->root_key.objectid, 1288 dest->root_key.objectid,
1306 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1612 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1613 new_key.offset = key.offset + destoff - off;
1631 1614
1615 trans = btrfs_start_transaction(root, 1);
1616 if (IS_ERR(trans)) {
1617 ret = PTR_ERR(trans);
1618 goto out;
1619 }
1620
1632 if (type == BTRFS_FILE_EXTENT_REG || 1621 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1622 type == BTRFS_FILE_EXTENT_PREALLOC) {
1623 if (off > key.offset) {
1624 datao += off - key.offset;
1625 datal -= off - key.offset;
1626 }
1627
1628 if (key.offset + datal > off + len)
1629 datal = off + len - key.offset;
1630
1631 ret = btrfs_drop_extents(trans, inode,
1632 new_key.offset,
1633 new_key.offset + datal,
1634 &hint_byte, 1);
1635 BUG_ON(ret);
1636
1634 ret = btrfs_insert_empty_item(trans, root, path, 1637 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1638 &new_key, size);
1636 if (ret) 1639 BUG_ON(ret);
1637 goto out;
1638 1640
1639 leaf = path->nodes[0]; 1641 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1642 slot = path->slots[0];
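For regular and preallocated extents, the clone loop above now clips each source extent to the requested window before dropping the destination range and inserting the new item: the data offset is advanced when the extent starts before 'off', and the length is trimmed when it runs past 'off + len'. A stand-alone arithmetic check of the trimming case, with illustrative values only:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long off = 4096, len = 8192;      /* clone window [4096, 12288) */
        unsigned long long key_offset = 4096;           /* extent starts at the window start */
        unsigned long long datao = 0, datal = 16384;    /* 16K extent, runs past the window */

        if (off > key_offset) {                 /* extent starts before the window: */
            datao += off - key_offset;          /* skip the leading part ...        */
            datal -= off - key_offset;          /* ... and shorten the length       */
        }
        if (key_offset + datal > off + len)
            datal = off + len - key_offset;     /* trim the part past the window */

        printf("cloned slice: data offset %llu, length %llu\n", datao, datal);
        return 0;
    }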
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1647 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1648 struct btrfs_file_extent_item);
1647 1649
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1650 /* disko == 0 means it's a hole */
1657 if (!disko) 1651 if (!disko)
1658 datao = 0; 1652 datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1677
1684 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1686 goto out; 1681 goto out;
1687 } 1682 }
1688 size -= skip + trim; 1683 size -= skip + trim;
1689 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1690 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1693 &new_key, size);
1692 if (ret) 1694 BUG_ON(ret);
1693 goto out;
1694 1695
1695 if (skip) { 1696 if (skip) {
1696 u32 start = 1697 u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1709 }
1709 1710
1710 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1711 } 1712 btrfs_release_path(root, path);
1712 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1713next: 1723next:
1714 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1715 key.offset++; 1725 key.offset++;
@@ -1717,17 +1727,7 @@ next:
1717 ret = 0; 1727 ret = 0;
1718out: 1728out:
1719 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1731out_unlock:
1732 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
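range_overlaps() above treats each ordered extent as the half-open byte range [file_offset, file_offset + len): two ranges overlap unless one ends at or before the other begins. A stand-alone check of that predicate (illustrative only):

    #include <assert.h>
    #include <stdio.h>

    static int ranges_overlap(unsigned long long a_start, unsigned long long a_len,
                              unsigned long long b_start, unsigned long long b_len)
    {
        if (a_start + a_len <= b_start || b_start + b_len <= a_start)
            return 0;
        return 1;
    }

    int main(void)
    {
        assert(ranges_overlap(0, 4096, 4095, 1));       /* B covers only the last byte of A */
        assert(!ranges_overlap(0, 4096, 4096, 4096));   /* adjacent ranges do not overlap */
        assert(ranges_overlap(1000, 100, 0, 2000));     /* containment counts as overlap */
        printf("overlap predicate behaves as expected\n");
        return 0;
    }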
127/* 136/*
128 * find the first ordered struct that has this offset, otherwise 137
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area, we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..05d41e569236 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner; may not be up to date */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* is the block in a non-reference-counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref nodes. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
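The alloc/free wrappers above pair every node and edge allocation with a per-cache counter, so backref_cache_cleanup() can assert at teardown that nothing leaked (the BUG_ON(cache->nr_nodes) / BUG_ON(cache->nr_edges) checks). A standalone sketch of that counted-allocation pattern, using ordinary userspace malloc/assert as stand-ins for kzalloc/BUG_ON; purely illustrative, not btrfs code:

#include <assert.h>
#include <stdlib.h>

struct cache { int nr_nodes; };
struct node  { int dummy; };

/* allocate and count, mirroring alloc_backref_node() */
static struct node *cache_alloc_node(struct cache *c)
{
        struct node *n = calloc(1, sizeof(*n));
        if (n)
                c->nr_nodes++;
        return n;
}

/* free and uncount, mirroring free_backref_node() */
static void cache_free_node(struct cache *c, struct node *n)
{
        if (n) {
                c->nr_nodes--;
                free(n);
        }
}

int main(void)
{
        struct cache c = { 0 };
        struct node *n = cache_alloc_node(&c);
        cache_free_node(&c, n);
        assert(c.nr_nodes == 0);   /* analogue of BUG_ON(cache->nr_nodes) */
        return 0;
}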
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
 477 * lookup. a transaction commit changes the extent tree,
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
 526 * if there is a reloc tree and it was created in the previous
 527 * transaction, backref lookup can find the reloc tree,
 528 * so the backref node for the fs tree root is useless for
 529 * relocation.
530 */
531 return 1;
532}
533
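update_backref_cache() above is essentially a generation check: the cache remembers the transaction id it was last synchronized with, and once a newer transaction is seen it drops the detached nodes and re-keys every node whose block moved (new_bytenr) before the cache may be trusted again. A minimal standalone sketch of just the generation-guard part, with an integer standing in for the detached list; illustrative userspace C, not the kernel implementation:

#include <stdint.h>
#include <stdio.h>

struct gen_cache {
        uint64_t last_gen;      /* like cache->last_trans */
        int      stale_entries; /* stand-in for the detached list */
};

/* returns 1 if the cache had to be refreshed, 0 if it was already current */
static int cache_sync(struct gen_cache *c, uint64_t current_gen)
{
        if (c->last_gen == 0) {         /* first use: just record the gen */
                c->last_gen = current_gen;
                return 0;
        }
        if (c->last_gen == current_gen) /* same transaction: nothing to do */
                return 0;

        c->stale_entries = 0;           /* drop entries made stale by the commit */
        c->last_gen = 0;                /* force re-recording on next use */
        return 1;
}

int main(void)
{
        struct gen_cache c = { 0, 3 };
        int first  = cache_sync(&c, 100);   /* records gen 100, returns 0 */
        int second = cache_sync(&c, 101);   /* gen changed: flush, returns 1 */
        printf("%d %d\n", first, second);
        return 0;
}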
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,20 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 root = find_tree_root(rc, eb, ref0);
596 if (root) 788 if (!root->ref_cows)
597 cur->root = root; 789 cur->cowonly = 1;
598 else 790 if (key.objectid == key.offset) {
599 cur->old_root = 1; 791 if (root && !should_ignore_root(root))
600 break; 792 cur->root = root;
793 else
794 list_add(&cur->list, &useless);
795 break;
796 }
601 } 797 }
602#else 798#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 799 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +810,20 @@ again:
614 break; 810 break;
615 } 811 }
616 812
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 813 edge = alloc_backref_edge(cache);
618 if (!edge) { 814 if (!edge) {
619 err = -ENOMEM; 815 err = -ENOMEM;
620 goto out; 816 goto out;
621 } 817 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 818 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 819 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 820 upper = alloc_backref_node(cache);
625 if (!upper) { 821 if (!upper) {
626 kfree(edge); 822 free_backref_edge(cache, edge);
627 err = -ENOMEM; 823 err = -ENOMEM;
628 goto out; 824 goto out;
629 } 825 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 826 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 827 upper->level = cur->level + 1;
634 /* 828 /*
635 * backrefs for the upper level block isn't 829 * backrefs for the upper level block isn't
@@ -639,11 +833,12 @@ again:
639 } else { 833 } else {
640 upper = rb_entry(rb_node, struct backref_node, 834 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 835 rb_node);
836 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 837 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 838 }
644 list_add(&edge->list[LOWER], &cur->upper); 839 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 840 edge->node[LOWER] = cur;
841 edge->node[UPPER] = upper;
647 842
648 goto next; 843 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 844 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +852,17 @@ again:
657 goto out; 852 goto out;
658 } 853 }
659 854
855 if (!root->ref_cows)
856 cur->cowonly = 1;
857
660 if (btrfs_root_level(&root->root_item) == cur->level) { 858 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 859 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 860 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 861 cur->bytenr);
664 cur->root = root; 862 if (should_ignore_root(root))
863 list_add(&cur->list, &useless);
864 else
865 cur->root = root;
665 break; 866 break;
666 } 867 }
667 868
@@ -692,11 +893,14 @@ again:
692 if (!path2->nodes[level]) { 893 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 894 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 895 lower->bytenr);
695 lower->root = root; 896 if (should_ignore_root(root))
897 list_add(&lower->list, &useless);
898 else
899 lower->root = root;
696 break; 900 break;
697 } 901 }
698 902
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 903 edge = alloc_backref_edge(cache);
700 if (!edge) { 904 if (!edge) {
701 err = -ENOMEM; 905 err = -ENOMEM;
702 goto out; 906 goto out;
@@ -705,16 +909,17 @@ again:
705 eb = path2->nodes[level]; 909 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 910 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 911 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 912 upper = alloc_backref_node(cache);
709 if (!upper) { 913 if (!upper) {
710 kfree(edge); 914 free_backref_edge(cache, edge);
711 err = -ENOMEM; 915 err = -ENOMEM;
712 goto out; 916 goto out;
713 } 917 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 918 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 919 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 920 upper->level = lower->level + 1;
921 if (!root->ref_cows)
922 upper->cowonly = 1;
718 923
719 /* 924 /*
720 * if we know the block isn't shared 925 * if we know the block isn't shared
@@ -744,10 +949,12 @@ again:
744 rb_node); 949 rb_node);
745 BUG_ON(!upper->checked); 950 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 951 INIT_LIST_HEAD(&edge->list[UPPER]);
952 if (!upper->owner)
953 upper->owner = btrfs_header_owner(eb);
747 } 954 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 955 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 956 edge->node[LOWER] = lower;
957 edge->node[UPPER] = upper;
751 958
752 if (rb_node) 959 if (rb_node)
753 break; 960 break;
@@ -785,8 +992,13 @@ next:
785 * into the cache. 992 * into the cache.
786 */ 993 */
787 BUG_ON(!node->checked); 994 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 995 cowonly = node->cowonly;
789 BUG_ON(rb_node); 996 if (!cowonly) {
997 rb_node = tree_insert(&cache->rb_root, node->bytenr,
998 &node->rb_node);
999 BUG_ON(rb_node);
1000 list_add_tail(&node->lower, &cache->leaves);
1001 }
790 1002
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1003 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1004 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1007,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1007 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1008 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1009 upper = edge->node[UPPER];
1010 if (upper->detached) {
1011 list_del(&edge->list[LOWER]);
1012 lower = edge->node[LOWER];
1013 free_backref_edge(cache, edge);
1014 if (list_empty(&lower->upper))
1015 list_add(&lower->list, &useless);
1016 continue;
1017 }
798 1018
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1019 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1020 if (upper->lowest) {
@@ -807,25 +1027,69 @@ next:
807 } 1027 }
808 1028
809 BUG_ON(!upper->checked); 1029 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1030 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1031 if (!cowonly) {
812 BUG_ON(rb_node); 1032 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1033 &upper->rb_node);
1034 BUG_ON(rb_node);
1035 }
813 1036
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1037 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1038
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1039 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1040 list_add_tail(&edge->list[UPPER], &list);
818 } 1041 }
1042 /*
1043 * process useless backref nodes. backref nodes for tree leaves
1044 * are deleted from the cache. backref nodes for upper level
1045 * tree blocks are left in the cache to avoid unnecessary backref
1046 * lookup.
1047 */
1048 while (!list_empty(&useless)) {
1049 upper = list_entry(useless.next, struct backref_node, list);
1050 list_del_init(&upper->list);
1051 BUG_ON(!list_empty(&upper->upper));
1052 if (upper == node)
1053 node = NULL;
1054 if (upper->lowest) {
1055 list_del_init(&upper->lower);
1056 upper->lowest = 0;
1057 }
1058 while (!list_empty(&upper->lower)) {
1059 edge = list_entry(upper->lower.next,
1060 struct backref_edge, list[UPPER]);
1061 list_del(&edge->list[UPPER]);
1062 list_del(&edge->list[LOWER]);
1063 lower = edge->node[LOWER];
1064 free_backref_edge(cache, edge);
1065
1066 if (list_empty(&lower->upper))
1067 list_add(&lower->list, &useless);
1068 }
1069 __mark_block_processed(rc, upper);
1070 if (upper->level > 0) {
1071 list_add(&upper->list, &cache->detached);
1072 upper->detached = 1;
1073 } else {
1074 rb_erase(&upper->rb_node, &cache->rb_root);
1075 free_backref_node(cache, upper);
1076 }
1077 }
819out: 1078out:
820 btrfs_free_path(path1); 1079 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1080 btrfs_free_path(path2);
822 if (err) { 1081 if (err) {
823 INIT_LIST_HEAD(&list); 1082 while (!list_empty(&useless)) {
1083 lower = list_entry(useless.next,
1084 struct backref_node, upper);
1085 list_del_init(&lower->upper);
1086 }
824 upper = node; 1087 upper = node;
1088 INIT_LIST_HEAD(&list);
825 while (upper) { 1089 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1090 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1091 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1092 free_backref_node(cache, upper);
829 } 1093 }
830 1094
831 if (list_empty(&list)) 1095 if (list_empty(&list))
@@ -833,15 +1097,104 @@ out:
833 1097
834 edge = list_entry(list.next, struct backref_edge, 1098 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1099 list[LOWER]);
1100 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1101 upper = edge->node[UPPER];
837 kfree(edge); 1102 free_backref_edge(cache, edge);
838 } 1103 }
839 return ERR_PTR(err); 1104 return ERR_PTR(err);
840 } 1105 }
1106 BUG_ON(node && node->detached);
841 return node; 1107 return node;
842} 1108}
843 1109
844/* 1110/*
1111 * helper to add backref node for the newly created snapshot.
1112 * the backref node is created by cloning the backref node that
1113 * corresponds to the root of the source tree
1114 */
1115static int clone_backref_node(struct btrfs_trans_handle *trans,
1116 struct reloc_control *rc,
1117 struct btrfs_root *src,
1118 struct btrfs_root *dest)
1119{
1120 struct btrfs_root *reloc_root = src->reloc_root;
1121 struct backref_cache *cache = &rc->backref_cache;
1122 struct backref_node *node = NULL;
1123 struct backref_node *new_node;
1124 struct backref_edge *edge;
1125 struct backref_edge *new_edge;
1126 struct rb_node *rb_node;
1127
1128 if (cache->last_trans > 0)
1129 update_backref_cache(trans, cache);
1130
1131 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1132 if (rb_node) {
1133 node = rb_entry(rb_node, struct backref_node, rb_node);
1134 if (node->detached)
1135 node = NULL;
1136 else
1137 BUG_ON(node->new_bytenr != reloc_root->node->start);
1138 }
1139
1140 if (!node) {
1141 rb_node = tree_search(&cache->rb_root,
1142 reloc_root->commit_root->start);
1143 if (rb_node) {
1144 node = rb_entry(rb_node, struct backref_node,
1145 rb_node);
1146 BUG_ON(node->detached);
1147 }
1148 }
1149
1150 if (!node)
1151 return 0;
1152
1153 new_node = alloc_backref_node(cache);
1154 if (!new_node)
1155 return -ENOMEM;
1156
1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest;
1160 new_node->root = dest;
1161
1162 if (!node->lowest) {
1163 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1164 new_edge = alloc_backref_edge(cache);
1165 if (!new_edge)
1166 goto fail;
1167
1168 new_edge->node[UPPER] = new_node;
1169 new_edge->node[LOWER] = edge->node[LOWER];
1170 list_add_tail(&new_edge->list[UPPER],
1171 &new_node->lower);
1172 }
1173 }
1174
1175 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1176 &new_node->rb_node);
1177 BUG_ON(rb_node);
1178
1179 if (!new_node->lowest) {
1180 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1181 list_add_tail(&new_edge->list[LOWER],
1182 &new_edge->node[LOWER]->upper);
1183 }
1184 }
1185 return 0;
1186fail:
1187 while (!list_empty(&new_node->lower)) {
1188 new_edge = list_entry(new_node->lower.next,
1189 struct backref_edge, list[UPPER]);
1190 list_del(&new_edge->list[UPPER]);
1191 free_backref_edge(cache, new_edge);
1192 }
1193 free_backref_node(cache, new_node);
1194 return -ENOMEM;
1195}
1196
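clone_backref_node() above duplicates the cached node and all of its lower edges for a freshly created snapshot, and the fail: path unwinds whatever was allocated if an edge allocation fails, so the cache is never left half-cloned. A small standalone sketch of that clone-then-rollback pattern, with the per-edge loop collapsed into a single child-array copy; illustrative userspace C, not the kernel code:

#include <stdlib.h>
#include <string.h>

struct node {
        int          level;
        size_t       nr_children;
        struct node **children;     /* analogue of the lower edge list */
};

/* clone 'src'; on any allocation failure free partial work and return NULL */
static struct node *clone_node(const struct node *src)
{
        struct node *clone = calloc(1, sizeof(*clone));
        if (!clone)
                return NULL;

        clone->level = src->level;
        clone->children = calloc(src->nr_children, sizeof(*clone->children));
        if (!clone->children) {
                free(clone);            /* rollback, like the 'fail:' label */
                return NULL;
        }
        memcpy(clone->children, src->children,
               src->nr_children * sizeof(*clone->children));
        clone->nr_children = src->nr_children;
        return clone;
}

int main(void)
{
        struct node child = { 0, 0, NULL };
        struct node *kids[] = { &child };
        struct node src = { 1, 1, kids };
        struct node *c = clone_node(&src);
        if (c) {
                free(c->children);
                free(c);
        }
        return 0;
}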
1197/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1198 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1199 */
847static int __add_reloc_root(struct btrfs_root *root) 1200static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1254 return 0;
902} 1255}
903 1256
904/* 1257static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1258 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1259{
911 struct btrfs_root *reloc_root; 1260 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1261 struct extent_buffer *eb;
@@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1263 struct btrfs_key root_key;
915 int ret; 1264 int ret;
916 1265
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1266 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1267 BUG_ON(!root_item);
930 1268
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1269 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1270 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1271 root_key.offset = objectid;
934 1272
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1273 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1274 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1275 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1276 BTRFS_TREE_RELOC_OBJECTID);
1277 BUG_ON(ret);
1278
1279 btrfs_set_root_last_snapshot(&root->root_item,
1280 trans->transid - 1);
1281 } else {
1282 /*
1283 * called by btrfs_reloc_post_snapshot_hook.
1284 * the source tree is a reloc tree; all tree blocks
1285 * modified after it was created have the RELOC flag
1286 * set in their headers, so it's OK not to update
1287 * the 'last_snapshot'.
1288 */
1289 ret = btrfs_copy_root(trans, root, root->node, &eb,
1290 BTRFS_TREE_RELOC_OBJECTID);
1291 BUG_ON(ret);
1292 }
938 1293
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1294 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1295 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1296 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1297 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1298
946 root_item->drop_level = 0; 1299 if (root->root_key.objectid == objectid) {
1300 btrfs_set_root_refs(root_item, 0);
1301 memset(&root_item->drop_progress, 0,
1302 sizeof(struct btrfs_disk_key));
1303 root_item->drop_level = 0;
1304 }
947 1305
948 btrfs_tree_unlock(eb); 1306 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1307 free_extent_buffer(eb);
@@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1315 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1316 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1317 reloc_root->last_trans = trans->transid;
1318 return reloc_root;
1319}
1320
1321/*
1322 * create reloc tree for a given fs tree. reloc tree is just a
1323 * snapshot of the fs tree with special root objectid.
1324 */
1325int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1326 struct btrfs_root *root)
1327{
1328 struct btrfs_root *reloc_root;
1329 struct reloc_control *rc = root->fs_info->reloc_ctl;
1330 int clear_rsv = 0;
1331
1332 if (root->reloc_root) {
1333 reloc_root = root->reloc_root;
1334 reloc_root->last_trans = trans->transid;
1335 return 0;
1336 }
1337
1338 if (!rc || !rc->create_reloc_tree ||
1339 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1340 return 0;
1341
1342 if (!trans->block_rsv) {
1343 trans->block_rsv = rc->block_rsv;
1344 clear_rsv = 1;
1345 }
1346 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1347 if (clear_rsv)
1348 trans->block_rsv = NULL;
960 1349
961 __add_reloc_root(reloc_root); 1350 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1351 root->reloc_root = reloc_root;
@@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1369 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1370 root_item = &reloc_root->root_item;
982 1371
983 if (btrfs_root_refs(root_item) == 0) { 1372 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1373 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1374 root->reloc_root = NULL;
985 del = 1; 1375 del = 1;
986 } 1376 }
@@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1492 goto out;
1103 } 1493 }
1104 1494
1105 if (new_bytenr) 1495 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1496 ret = 0;
1108out: 1497out:
1109 btrfs_free_path(path); 1498 btrfs_free_path(path);
@@ -1114,19 +1503,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1503 * update file extent items in the tree leaf to point to
1115 * the new locations. 1504 * the new locations.
1116 */ 1505 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1506static noinline_for_stack
1118 struct reloc_control *rc, 1507int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1508 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1509 struct btrfs_root *root,
1121 struct list_head *inode_list) 1510 struct extent_buffer *leaf)
1122{ 1511{
1123 struct btrfs_key key; 1512 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1513 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1514 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1515 u64 parent;
1128 u64 bytenr; 1516 u64 bytenr;
1129 u64 new_bytenr; 1517 u64 new_bytenr = 0;
1130 u64 num_bytes; 1518 u64 num_bytes;
1131 u64 end; 1519 u64 end;
1132 u32 nritems; 1520 u32 nritems;
@@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1554 * to complete and drop the extent cache
1167 */ 1555 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1556 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1557 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1558 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1559 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1560 } else if (inode && inode->i_ino < key.objectid) {
1561 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1562 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1563 }
1185 if (inode && inode->i_ino == key.objectid) { 1564 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1565 end = key.offset +
@@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1583
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1584 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1585 bytenr, num_bytes);
1207 if (ret > 0) 1586 if (ret > 0) {
1587 WARN_ON(1);
1208 continue; 1588 continue;
1589 }
1209 BUG_ON(ret < 0); 1590 BUG_ON(ret < 0);
1210 1591
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1592 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1606 }
1226 if (dirty) 1607 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1608 btrfs_mark_buffer_dirty(leaf);
1609 if (inode)
1610 btrfs_add_delayed_iput(inode);
1228 return 0; 1611 return 0;
1229} 1612}
1230 1613
@@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1631 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1632 * errors, a negative error number is returned.
1250 */ 1633 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1634static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1635int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1636 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1637 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1638 int lowest_level, int max_level)
1256{ 1639{
1257 struct extent_buffer *eb; 1640 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1641 struct extent_buffer *parent;
@@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1263 u64 new_ptr_gen; 1646 u64 new_ptr_gen;
1264 u64 last_snapshot; 1647 u64 last_snapshot;
1265 u32 blocksize; 1648 u32 blocksize;
1649 int cow = 0;
1266 int level; 1650 int level;
1267 int ret; 1651 int ret;
1268 int slot; 1652 int slot;
1269 1653
1270 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1654 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272 BUG_ON(lowest_level > 1 && leaf);
1273 1656
1274 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1657 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275 1658again:
1276 slot = path->slots[lowest_level]; 1659 slot = path->slots[lowest_level];
1277 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1660 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278 1661
@@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1286 return 0; 1669 return 0;
1287 } 1670 }
1288 1671
1289 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1672 if (cow) {
1290 BUG_ON(ret); 1673 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1674 BUG_ON(ret);
1675 }
1291 btrfs_set_lock_blocking(eb); 1676 btrfs_set_lock_blocking(eb);
1292 1677
1293 if (next_key) { 1678 if (next_key) {
@@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1331 1716
1332 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1717 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333 memcmp_node_keys(parent, slot, path, level)) { 1718 memcmp_node_keys(parent, slot, path, level)) {
1334 if (level <= lowest_level && !leaf) { 1719 if (level <= lowest_level) {
1335 ret = 0; 1720 ret = 0;
1336 break; 1721 break;
1337 } 1722 }
@@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1339 eb = read_tree_block(dest, old_bytenr, blocksize, 1724 eb = read_tree_block(dest, old_bytenr, blocksize,
1340 old_ptr_gen); 1725 old_ptr_gen);
1341 btrfs_tree_lock(eb); 1726 btrfs_tree_lock(eb);
1342 ret = btrfs_cow_block(trans, dest, eb, parent, 1727 if (cow) {
1343 slot, &eb); 1728 ret = btrfs_cow_block(trans, dest, eb, parent,
1344 BUG_ON(ret); 1729 slot, &eb);
1345 btrfs_set_lock_blocking(eb); 1730 BUG_ON(ret);
1346
1347 if (level <= lowest_level) {
1348 *leaf = eb;
1349 ret = 0;
1350 break;
1351 } 1731 }
1732 btrfs_set_lock_blocking(eb);
1352 1733
1353 btrfs_tree_unlock(parent); 1734 btrfs_tree_unlock(parent);
1354 free_extent_buffer(parent); 1735 free_extent_buffer(parent);
@@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1357 continue; 1738 continue;
1358 } 1739 }
1359 1740
1741 if (!cow) {
1742 btrfs_tree_unlock(parent);
1743 free_extent_buffer(parent);
1744 cow = 1;
1745 goto again;
1746 }
1747
1360 btrfs_node_key_to_cpu(path->nodes[level], &key, 1748 btrfs_node_key_to_cpu(path->nodes[level], &key,
1361 path->slots[level]); 1749 path->slots[level]);
1362 btrfs_release_path(src, path); 1750 btrfs_release_path(src, path);
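With the new cow flag, replace_path() becomes a two-pass walk: the first descent is read-only, and only when it finds a pointer that actually needs to be swapped does it jump back to again: and redo the walk with COW enabled, so trees that need no change are never dirtied. A standalone sketch of that optimistic-then-retry shape; illustrative userspace C over a plain array, not the kernel code:

#include <stdbool.h>
#include <stdio.h>

/* pretend "work" over an array: returns true if a retry in write mode is needed */
static bool process(int *vals, int n, bool write_mode, bool *changed)
{
        for (int i = 0; i < n; i++) {
                if (vals[i] % 2 == 0)
                        continue;               /* nothing to do for this slot */
                if (!write_mode)
                        return true;            /* found work: caller must retry */
                vals[i] += 1;                   /* expensive mutation, only in pass 2 */
                *changed = true;
        }
        return false;
}

int main(void)
{
        int vals[] = { 2, 3, 4 };
        bool changed = false;
        bool write_mode = false;
again:
        if (process(vals, 3, write_mode, &changed)) {
                write_mode = true;              /* like setting cow = 1 */
                goto again;
        }
        printf("changed=%d\n", changed);
        return 0;
}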
@@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1562 return 0; 1950 return 0;
1563} 1951}
1564 1952
1565static void put_inodes(struct list_head *list)
1566{
1567 struct inodevec *ivec;
1568 while (!list_empty(list)) {
1569 ivec = list_entry(list->next, struct inodevec, list);
1570 list_del(&ivec->list);
1571 while (ivec->nr > 0) {
1572 ivec->nr--;
1573 iput(ivec->inode[ivec->nr]);
1574 }
1575 kfree(ivec);
1576 }
1577}
1578
1579static int find_next_key(struct btrfs_path *path, int level, 1953static int find_next_key(struct btrfs_path *path, int level,
1580 struct btrfs_key *key) 1954 struct btrfs_key *key)
1581 1955
@@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1608 struct btrfs_root *reloc_root; 1982 struct btrfs_root *reloc_root;
1609 struct btrfs_root_item *root_item; 1983 struct btrfs_root_item *root_item;
1610 struct btrfs_path *path; 1984 struct btrfs_path *path;
1611 struct extent_buffer *leaf = NULL; 1985 struct extent_buffer *leaf;
1612 unsigned long nr; 1986 unsigned long nr;
1613 int level; 1987 int level;
1614 int max_level; 1988 int max_level;
1615 int replaced = 0; 1989 int replaced = 0;
1616 int ret; 1990 int ret;
1617 int err = 0; 1991 int err = 0;
1992 u32 min_reserved;
1618 1993
1619 path = btrfs_alloc_path(); 1994 path = btrfs_alloc_path();
1620 if (!path) 1995 if (!path)
@@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1648 btrfs_unlock_up_safe(path, 0); 2023 btrfs_unlock_up_safe(path, 0);
1649 } 2024 }
1650 2025
1651 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2026 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1652 trans = btrfs_start_transaction(root, 1); 2027 memset(&next_key, 0, sizeof(next_key));
1653 2028
1654 leaf = path->nodes[0]; 2029 while (1) {
1655 btrfs_item_key_to_cpu(leaf, &key, 0); 2030 trans = btrfs_start_transaction(root, 0);
1656 btrfs_release_path(reloc_root, path); 2031 trans->block_rsv = rc->block_rsv;
1657 2032
1658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1659 if (ret < 0) { 2034 min_reserved, 0);
1660 err = ret; 2035 if (ret) {
1661 goto out; 2036 BUG_ON(ret != -EAGAIN);
2037 ret = btrfs_commit_transaction(trans, root);
2038 BUG_ON(ret);
2039 continue;
1662 } 2040 }
1663 2041
1664 leaf = path->nodes[0];
1665 btrfs_unlock_up_safe(path, 1);
1666 ret = replace_file_extents(trans, rc, root, leaf,
1667 &inode_list);
1668 if (ret < 0)
1669 err = ret;
1670 goto out;
1671 }
1672
1673 memset(&next_key, 0, sizeof(next_key));
1674
1675 while (1) {
1676 leaf = NULL;
1677 replaced = 0; 2042 replaced = 0;
1678 trans = btrfs_start_transaction(root, 1);
1679 max_level = level; 2043 max_level = level;
1680 2044
1681 ret = walk_down_reloc_tree(reloc_root, path, &level); 2045 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1689 if (!find_next_key(path, level, &key) && 2053 if (!find_next_key(path, level, &key) &&
1690 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2054 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691 ret = 0; 2055 ret = 0;
1692 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693 ret = replace_path(trans, root, reloc_root,
1694 path, &next_key, &leaf,
1695 level, max_level);
1696 } else { 2056 } else {
1697 ret = replace_path(trans, root, reloc_root, 2057 ret = replace_path(trans, root, reloc_root, path,
1698 path, &next_key, NULL, 2058 &next_key, level, max_level);
1699 level, max_level);
1700 } 2059 }
1701 if (ret < 0) { 2060 if (ret < 0) {
1702 err = ret; 2061 err = ret;
@@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1708 btrfs_node_key_to_cpu(path->nodes[level], &key, 2067 btrfs_node_key_to_cpu(path->nodes[level], &key,
1709 path->slots[level]); 2068 path->slots[level]);
1710 replaced = 1; 2069 replaced = 1;
1711 } else if (leaf) {
1712 /*
1713 * no block got replaced, try replacing file extents
1714 */
1715 btrfs_item_key_to_cpu(leaf, &key, 0);
1716 ret = replace_file_extents(trans, rc, root, leaf,
1717 &inode_list);
1718 btrfs_tree_unlock(leaf);
1719 free_extent_buffer(leaf);
1720 BUG_ON(ret < 0);
1721 } 2070 }
1722 2071
1723 ret = walk_up_reloc_tree(reloc_root, path, &level); 2072 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1734 root_item->drop_level = level; 2083 root_item->drop_level = level;
1735 2084
1736 nr = trans->blocks_used; 2085 nr = trans->blocks_used;
1737 btrfs_end_transaction(trans, root); 2086 btrfs_end_transaction_throttle(trans, root);
1738 2087
1739 btrfs_btree_balance_dirty(root, nr); 2088 btrfs_btree_balance_dirty(root, nr);
1740 2089
1741 /*
1742 * put inodes outside transaction, otherwise we may deadlock.
1743 */
1744 put_inodes(&inode_list);
1745
1746 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2090 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747 invalidate_extent_cache(root, &key, &next_key); 2091 invalidate_extent_cache(root, &key, &next_key);
1748 } 2092 }
@@ -1765,87 +2109,125 @@ out:
1765 sizeof(root_item->drop_progress)); 2109 sizeof(root_item->drop_progress));
1766 root_item->drop_level = 0; 2110 root_item->drop_level = 0;
1767 btrfs_set_root_refs(root_item, 0); 2111 btrfs_set_root_refs(root_item, 0);
2112 btrfs_update_reloc_root(trans, root);
1768 } 2113 }
1769 2114
1770 nr = trans->blocks_used; 2115 nr = trans->blocks_used;
1771 btrfs_end_transaction(trans, root); 2116 btrfs_end_transaction_throttle(trans, root);
1772 2117
1773 btrfs_btree_balance_dirty(root, nr); 2118 btrfs_btree_balance_dirty(root, nr);
1774 2119
1775 put_inodes(&inode_list);
1776
1777 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2120 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778 invalidate_extent_cache(root, &key, &next_key); 2121 invalidate_extent_cache(root, &key, &next_key);
1779 2122
1780 return err; 2123 return err;
1781} 2124}
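merge_reloc_root() now checks before each pass that the relocation block reserve still holds at least nodesize * (BTRFS_MAX_LEVEL - 1) * 2 bytes, presumably enough to COW one full-height path on both the reloc-tree and fs-tree sides, and commits the transaction to refill the reserve when btrfs_block_rsv_check() returns -EAGAIN. A rough worked example of the arithmetic, assuming the common 4 KiB node size (BTRFS_MAX_LEVEL is 8 in ctree.h); illustrative only:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8       /* as defined in ctree.h */

int main(void)
{
        uint32_t nodesize = 4096;                               /* example value */
        uint32_t min_reserved = nodesize * (BTRFS_MAX_LEVEL - 1) * 2;

        /* 4096 * 7 * 2 = 57344 bytes must stay reserved per merge pass */
        printf("min_reserved = %u bytes\n", min_reserved);
        return 0;
}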
1782 2125
1783/* 2126static noinline_for_stack
1784 * callback for the work threads. 2127int prepare_to_merge(struct reloc_control *rc, int err)
1785 * this function merges reloc tree with corresponding fs tree,
1786 * and then drops the reloc tree.
1787 */
1788static void merge_func(struct btrfs_work *work)
1789{ 2128{
1790 struct btrfs_trans_handle *trans; 2129 struct btrfs_root *root = rc->extent_root;
1791 struct btrfs_root *root;
1792 struct btrfs_root *reloc_root; 2130 struct btrfs_root *reloc_root;
1793 struct async_merge *async; 2131 struct btrfs_trans_handle *trans;
2132 LIST_HEAD(reloc_roots);
2133 u64 num_bytes = 0;
2134 int ret;
2135 int retries = 0;
2136
2137 mutex_lock(&root->fs_info->trans_mutex);
2138 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2139 rc->merging_rsv_size += rc->nodes_relocated * 2;
2140 mutex_unlock(&root->fs_info->trans_mutex);
2141again:
2142 if (!err) {
2143 num_bytes = rc->merging_rsv_size;
2144 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2145 num_bytes, &retries);
2146 if (ret)
2147 err = ret;
2148 }
2149
2150 trans = btrfs_join_transaction(rc->extent_root, 1);
2151
2152 if (!err) {
2153 if (num_bytes != rc->merging_rsv_size) {
2154 btrfs_end_transaction(trans, rc->extent_root);
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 retries = 0;
2158 goto again;
2159 }
2160 }
1794 2161
1795 async = container_of(work, struct async_merge, work); 2162 rc->merge_reloc_tree = 1;
1796 reloc_root = async->root; 2163
2164 while (!list_empty(&rc->reloc_roots)) {
2165 reloc_root = list_entry(rc->reloc_roots.next,
2166 struct btrfs_root, root_list);
2167 list_del_init(&reloc_root->root_list);
1797 2168
1798 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799 root = read_fs_root(reloc_root->fs_info, 2169 root = read_fs_root(reloc_root->fs_info,
1800 reloc_root->root_key.offset); 2170 reloc_root->root_key.offset);
1801 BUG_ON(IS_ERR(root)); 2171 BUG_ON(IS_ERR(root));
1802 BUG_ON(root->reloc_root != reloc_root); 2172 BUG_ON(root->reloc_root != reloc_root);
1803 2173
1804 merge_reloc_root(async->rc, root); 2174 /*
1805 2175 * set reference count to 1, so btrfs_recover_relocation
1806 trans = btrfs_start_transaction(root, 1); 2176 * knows it should resume merging
2177 */
2178 if (!err)
2179 btrfs_set_root_refs(&reloc_root->root_item, 1);
1807 btrfs_update_reloc_root(trans, root); 2180 btrfs_update_reloc_root(trans, root);
1808 btrfs_end_transaction(trans, root);
1809 }
1810 2181
1811 btrfs_drop_snapshot(reloc_root, 0); 2182 list_add(&reloc_root->root_list, &reloc_roots);
2183 }
1812 2184
1813 if (atomic_dec_and_test(async->num_pending)) 2185 list_splice(&reloc_roots, &rc->reloc_roots);
1814 complete(async->done);
1815 2186
1816 kfree(async); 2187 if (!err)
2188 btrfs_commit_transaction(trans, rc->extent_root);
2189 else
2190 btrfs_end_transaction(trans, rc->extent_root);
2191 return err;
1817} 2192}
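prepare_to_merge() reserves rc->merging_rsv_size bytes before joining a transaction, then re-checks the target: if more reloc trees were created meanwhile and the required size grew, it releases the reservation and starts over, so the reservation it finally commits with always matches the current requirement. A compact sketch of that reserve / re-check / retry loop; illustrative userspace C with plain counters standing in for btrfs_block_rsv_add/release:

#include <stdint.h>
#include <stdio.h>

static uint64_t required = 1024;   /* analogue of rc->merging_rsv_size */
static uint64_t reserved;
static int grew;

static void maybe_grow(void)
{
        if (!grew) {               /* simulate a new reloc tree appearing once */
                required += 256;
                grew = 1;
        }
}

int main(void)
{
        for (;;) {
                uint64_t want = required;
                reserved += want;          /* like btrfs_block_rsv_add() */
                maybe_grow();              /* like joining the transaction */
                if (want != required) {    /* target moved: undo and retry */
                        reserved -= want;  /* like btrfs_block_rsv_release() */
                        continue;
                }
                break;
        }
        printf("reserved=%llu required=%llu\n",
               (unsigned long long)reserved, (unsigned long long)required);
        return 0;
}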
1818 2193
1819static int merge_reloc_roots(struct reloc_control *rc) 2194static noinline_for_stack
2195int merge_reloc_roots(struct reloc_control *rc)
1820{ 2196{
1821 struct async_merge *async;
1822 struct btrfs_root *root; 2197 struct btrfs_root *root;
1823 struct completion done; 2198 struct btrfs_root *reloc_root;
1824 atomic_t num_pending; 2199 LIST_HEAD(reloc_roots);
2200 int found = 0;
2201 int ret;
2202again:
2203 root = rc->extent_root;
2204 mutex_lock(&root->fs_info->trans_mutex);
2205 list_splice_init(&rc->reloc_roots, &reloc_roots);
2206 mutex_unlock(&root->fs_info->trans_mutex);
1825 2207
1826 init_completion(&done); 2208 while (!list_empty(&reloc_roots)) {
1827 atomic_set(&num_pending, 1); 2209 found = 1;
2210 reloc_root = list_entry(reloc_roots.next,
2211 struct btrfs_root, root_list);
1828 2212
1829 while (!list_empty(&rc->reloc_roots)) { 2213 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1830 root = list_entry(rc->reloc_roots.next, 2214 root = read_fs_root(reloc_root->fs_info,
1831 struct btrfs_root, root_list); 2215 reloc_root->root_key.offset);
1832 list_del_init(&root->root_list); 2216 BUG_ON(IS_ERR(root));
2217 BUG_ON(root->reloc_root != reloc_root);
1833 2218
1834 async = kmalloc(sizeof(*async), GFP_NOFS); 2219 ret = merge_reloc_root(rc, root);
1835 BUG_ON(!async); 2220 BUG_ON(ret);
1836 async->work.func = merge_func; 2221 } else {
1837 async->work.flags = 0; 2222 list_del_init(&reloc_root->root_list);
1838 async->rc = rc; 2223 }
1839 async->root = root; 2224 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1840 async->done = &done;
1841 async->num_pending = &num_pending;
1842 atomic_inc(&num_pending);
1843 btrfs_queue_worker(&rc->workers, &async->work);
1844 } 2225 }
1845 2226
1846 if (!atomic_dec_and_test(&num_pending)) 2227 if (found) {
1847 wait_for_completion(&done); 2228 found = 0;
1848 2229 goto again;
2230 }
1849 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2231 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850 return 0; 2232 return 0;
1851} 2233}
@@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1876 return btrfs_record_root_in_trans(trans, root); 2258 return btrfs_record_root_in_trans(trans, root);
1877} 2259}
1878 2260
1879/* 2261static noinline_for_stack
1880 * select one tree from trees that references the block. 2262struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1881 * for blocks in refernce counted trees, we preper reloc tree. 2263 struct reloc_control *rc,
1882 * if no reloc tree found and reloc_only is true, NULL is returned. 2264 struct backref_node *node,
1883 */ 2265 struct backref_edge *edges[], int *nr)
1884static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885 struct backref_node *node,
1886 struct backref_edge *edges[],
1887 int *nr, int reloc_only)
1888{ 2266{
1889 struct backref_node *next; 2267 struct backref_node *next;
1890 struct btrfs_root *root; 2268 struct btrfs_root *root;
1891 int index; 2269 int index = 0;
1892 int loop = 0; 2270
1893again:
1894 index = 0;
1895 next = node; 2271 next = node;
1896 while (1) { 2272 while (1) {
1897 cond_resched(); 2273 cond_resched();
1898 next = walk_up_backref(next, edges, &index); 2274 next = walk_up_backref(next, edges, &index);
1899 root = next->root; 2275 root = next->root;
1900 if (!root) { 2276 BUG_ON(!root);
1901 BUG_ON(!node->old_root); 2277 BUG_ON(!root->ref_cows);
1902 goto skip;
1903 }
1904
1905 /* no other choice for non-refernce counted tree */
1906 if (!root->ref_cows) {
1907 BUG_ON(reloc_only);
1908 break;
1909 }
1910 2278
1911 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2279 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912 record_reloc_root_in_trans(trans, root); 2280 record_reloc_root_in_trans(trans, root);
1913 break; 2281 break;
1914 } 2282 }
1915 2283
1916 if (loop) { 2284 btrfs_record_root_in_trans(trans, root);
1917 btrfs_record_root_in_trans(trans, root); 2285 root = root->reloc_root;
2286
2287 if (next->new_bytenr != root->node->start) {
2288 BUG_ON(next->new_bytenr);
2289 BUG_ON(!list_empty(&next->list));
2290 next->new_bytenr = root->node->start;
2291 next->root = root;
2292 list_add_tail(&next->list,
2293 &rc->backref_cache.changed);
2294 __mark_block_processed(rc, next);
1918 break; 2295 break;
1919 } 2296 }
1920 2297
1921 if (reloc_only || next != node) { 2298 WARN_ON(1);
1922 if (!root->reloc_root)
1923 btrfs_record_root_in_trans(trans, root);
1924 root = root->reloc_root;
1925 /*
1926 * if the reloc tree was created in current
1927 * transation, there is no node in backref tree
1928 * corresponds to the root of the reloc tree.
1929 */
1930 if (btrfs_root_last_snapshot(&root->root_item) ==
1931 trans->transid - 1)
1932 break;
1933 }
1934skip:
1935 root = NULL; 2299 root = NULL;
1936 next = walk_down_backref(edges, &index); 2300 next = walk_down_backref(edges, &index);
1937 if (!next || next->level <= node->level) 2301 if (!next || next->level <= node->level)
1938 break; 2302 break;
1939 } 2303 }
2304 if (!root)
2305 return NULL;
1940 2306
1941 if (!root && !loop && !reloc_only) { 2307 *nr = index;
1942 loop = 1; 2308 next = node;
1943 goto again; 2309 /* setup backref node path for btrfs_reloc_cow_block */
2310 while (1) {
2311 rc->backref_cache.path[next->level] = next;
2312 if (--index < 0)
2313 break;
2314 next = edges[index]->node[UPPER];
1944 } 2315 }
1945
1946 if (root)
1947 *nr = index;
1948 else
1949 *nr = 0;
1950
1951 return root; 2316 return root;
1952} 2317}
1953 2318
2319/*
2320 * select a tree root for relocation. return NULL if the block
2321 * is reference counted. we should use do_relocation() in this
2322 * case. return a tree root pointer if the block isn't reference
2323 * counted. return -ENOENT if the block is root of reloc tree.
2324 */
1954static noinline_for_stack 2325static noinline_for_stack
1955struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2326struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956 struct backref_node *node) 2327 struct backref_node *node)
1957{ 2328{
2329 struct backref_node *next;
2330 struct btrfs_root *root;
2331 struct btrfs_root *fs_root = NULL;
1958 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2332 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959 int nr; 2333 int index = 0;
1960 return __select_one_root(trans, node, edges, &nr, 0); 2334
2335 next = node;
2336 while (1) {
2337 cond_resched();
2338 next = walk_up_backref(next, edges, &index);
2339 root = next->root;
2340 BUG_ON(!root);
2341
2342 /* no other choice for non-reference counted tree */
2343 if (!root->ref_cows)
2344 return root;
2345
2346 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2347 fs_root = root;
2348
2349 if (next != node)
2350 return NULL;
2351
2352 next = walk_down_backref(edges, &index);
2353 if (!next || next->level <= node->level)
2354 break;
2355 }
2356
2357 if (!fs_root)
2358 return ERR_PTR(-ENOENT);
2359 return fs_root;
1961} 2360}
1962 2361
1963static noinline_for_stack 2362static noinline_for_stack
1964struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2363u64 calcu_metadata_size(struct reloc_control *rc,
1965 struct backref_node *node, 2364 struct backref_node *node, int reserve)
1966 struct backref_edge *edges[], int *nr)
1967{ 2365{
1968 return __select_one_root(trans, node, edges, nr, 1); 2366 struct backref_node *next = node;
2367 struct backref_edge *edge;
2368 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2369 u64 num_bytes = 0;
2370 int index = 0;
2371
2372 BUG_ON(reserve && node->processed);
2373
2374 while (next) {
2375 cond_resched();
2376 while (1) {
2377 if (next->processed && (reserve || next != node))
2378 break;
2379
2380 num_bytes += btrfs_level_size(rc->extent_root,
2381 next->level);
2382
2383 if (list_empty(&next->upper))
2384 break;
2385
2386 edge = list_entry(next->upper.next,
2387 struct backref_edge, list[LOWER]);
2388 edges[index++] = edge;
2389 next = edge->node[UPPER];
2390 }
2391 next = walk_down_backref(edges, &index);
2392 }
2393 return num_bytes;
1969} 2394}
1970 2395
1971static void grab_path_buffers(struct btrfs_path *path, 2396static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1972 struct backref_node *node, 2397 struct reloc_control *rc,
1973 struct backref_edge *edges[], int nr) 2398 struct backref_node *node)
1974{ 2399{
1975 int i = 0; 2400 struct btrfs_root *root = rc->extent_root;
1976 while (1) { 2401 u64 num_bytes;
1977 drop_node_buffer(node); 2402 int ret;
1978 node->eb = path->nodes[node->level]; 2403
1979 BUG_ON(!node->eb); 2404 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1980 if (path->locks[node->level])
1981 node->locked = 1;
1982 path->nodes[node->level] = NULL;
1983 path->locks[node->level] = 0;
1984
1985 if (i >= nr)
1986 break;
1987 2405
1988 edges[i]->blockptr = node->eb->start; 2406 trans->block_rsv = rc->block_rsv;
1989 node = edges[i]->node[UPPER]; 2407 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1990 i++; 2408 &rc->block_rsv_retries);
2409 if (ret) {
2410 if (ret == -EAGAIN)
2411 rc->commit_transaction = 1;
2412 return ret;
1991 } 2413 }
2414
2415 rc->block_rsv_retries = 0;
2416 return 0;
2417}
2418
2419static void release_metadata_space(struct reloc_control *rc,
2420 struct backref_node *node)
2421{
2422 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2423 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1992} 2424}
1993 2425
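calcu_metadata_size() walks upward from the block being relocated along every chain of cached backref edges, adding the tree-block size of each ancestor that has not been processed yet, and reserve_metadata_space() reserves twice that sum before do_relocation() COWs the path. A simplified standalone sketch over a single parent-pointer chain (one upper edge per node, so the multi-edge walk collapses to a loop); illustrative only, not the kernel code:

#include <stdint.h>
#include <stdio.h>

struct blk {
        int          processed;
        uint32_t     size;       /* analogue of btrfs_level_size() */
        struct blk  *parent;     /* single upper edge for simplicity */
};

/* sum the sizes of this block and every unprocessed ancestor */
static uint64_t metadata_size(const struct blk *b)
{
        uint64_t bytes = 0;
        for (; b; b = b->parent) {
                if (b->processed && bytes)   /* stop at already-relocated ancestors */
                        break;
                bytes += b->size;
        }
        return bytes;
}

int main(void)
{
        struct blk root = { 1, 4096, NULL };
        struct blk mid  = { 0, 4096, &root };
        struct blk leaf = { 0, 4096, &mid };

        /* reservation is doubled, as in reserve_metadata_space() */
        printf("reserve %llu bytes\n",
               (unsigned long long)(metadata_size(&leaf) * 2));
        return 0;
}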
1994/* 2426/*
@@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1999 * in that case this function just updates pointers. 2431 * in that case this function just updates pointers.
2000 */ 2432 */
2001static int do_relocation(struct btrfs_trans_handle *trans, 2433static int do_relocation(struct btrfs_trans_handle *trans,
2434 struct reloc_control *rc,
2002 struct backref_node *node, 2435 struct backref_node *node,
2003 struct btrfs_key *key, 2436 struct btrfs_key *key,
2004 struct btrfs_path *path, int lowest) 2437 struct btrfs_path *path, int lowest)
@@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2019 BUG_ON(lowest && node->eb); 2452 BUG_ON(lowest && node->eb);
2020 2453
2021 path->lowest_level = node->level + 1; 2454 path->lowest_level = node->level + 1;
2455 rc->backref_cache.path[node->level] = node;
2022 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2456 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023 cond_resched(); 2457 cond_resched();
2024 if (node->eb && node->eb->start == edge->blockptr)
2025 continue;
2026 2458
2027 upper = edge->node[UPPER]; 2459 upper = edge->node[UPPER];
2028 root = select_reloc_root(trans, upper, edges, &nr); 2460 root = select_reloc_root(trans, rc, upper, edges, &nr);
2029 if (!root) 2461 BUG_ON(!root);
2030 continue; 2462
2031 2463 if (upper->eb && !upper->locked) {
2032 if (upper->eb && !upper->locked) 2464 if (!lowest) {
2465 ret = btrfs_bin_search(upper->eb, key,
2466 upper->level, &slot);
2467 BUG_ON(ret);
2468 bytenr = btrfs_node_blockptr(upper->eb, slot);
2469 if (node->eb->start == bytenr)
2470 goto next;
2471 }
2033 drop_node_buffer(upper); 2472 drop_node_buffer(upper);
2473 }
2034 2474
2035 if (!upper->eb) { 2475 if (!upper->eb) {
2036 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2476 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2040 } 2480 }
2041 BUG_ON(ret > 0); 2481 BUG_ON(ret > 0);
2042 2482
2043 slot = path->slots[upper->level]; 2483 if (!upper->eb) {
2484 upper->eb = path->nodes[upper->level];
2485 path->nodes[upper->level] = NULL;
2486 } else {
2487 BUG_ON(upper->eb != path->nodes[upper->level]);
2488 }
2044 2489
2045 btrfs_unlock_up_safe(path, upper->level + 1); 2490 upper->locked = 1;
2046 grab_path_buffers(path, upper, edges, nr); 2491 path->locks[upper->level] = 0;
2047 2492
2493 slot = path->slots[upper->level];
2048 btrfs_release_path(NULL, path); 2494 btrfs_release_path(NULL, path);
2049 } else { 2495 } else {
2050 ret = btrfs_bin_search(upper->eb, key, upper->level, 2496 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2053 } 2499 }
2054 2500
2055 bytenr = btrfs_node_blockptr(upper->eb, slot); 2501 bytenr = btrfs_node_blockptr(upper->eb, slot);
2056 if (!lowest) { 2502 if (lowest) {
2057 if (node->eb->start == bytenr) { 2503 BUG_ON(bytenr != node->bytenr);
2058 btrfs_tree_unlock(upper->eb);
2059 upper->locked = 0;
2060 continue;
2061 }
2062 } else { 2504 } else {
2063 BUG_ON(node->bytenr != bytenr); 2505 if (node->eb->start == bytenr)
2506 goto next;
2064 } 2507 }
2065 2508
2066 blocksize = btrfs_level_size(root, node->level); 2509 blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2072 if (!node->eb) { 2515 if (!node->eb) {
2073 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2516 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074 slot, &eb); 2517 slot, &eb);
2518 btrfs_tree_unlock(eb);
2519 free_extent_buffer(eb);
2075 if (ret < 0) { 2520 if (ret < 0) {
2076 err = ret; 2521 err = ret;
2077 break; 2522 goto next;
2078 } 2523 }
2079 btrfs_set_lock_blocking(eb); 2524 BUG_ON(node->eb != eb);
2080 node->eb = eb;
2081 node->locked = 1;
2082 } else { 2525 } else {
2083 btrfs_set_node_blockptr(upper->eb, slot, 2526 btrfs_set_node_blockptr(upper->eb, slot,
2084 node->eb->start); 2527 node->eb->start);
@@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2096 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2539 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097 BUG_ON(ret); 2540 BUG_ON(ret);
2098 } 2541 }
2099 if (!lowest) { 2542next:
2100 btrfs_tree_unlock(upper->eb); 2543 if (!upper->pending)
2101 upper->locked = 0; 2544 drop_node_buffer(upper);
2102 } 2545 else
2546 unlock_node_buffer(upper);
2547 if (err)
2548 break;
2103 } 2549 }
2550
2551 if (!err && node->pending) {
2552 drop_node_buffer(node);
2553 list_move_tail(&node->list, &rc->backref_cache.changed);
2554 node->pending = 0;
2555 }
2556
2104 path->lowest_level = 0; 2557 path->lowest_level = 0;
2558 BUG_ON(err == -ENOSPC);
2105 return err; 2559 return err;
2106} 2560}
2107 2561
2108static int link_to_upper(struct btrfs_trans_handle *trans, 2562static int link_to_upper(struct btrfs_trans_handle *trans,
2563 struct reloc_control *rc,
2109 struct backref_node *node, 2564 struct backref_node *node,
2110 struct btrfs_path *path) 2565 struct btrfs_path *path)
2111{ 2566{
2112 struct btrfs_key key; 2567 struct btrfs_key key;
2113 if (!node->eb || list_empty(&node->upper))
2114 return 0;
2115 2568
2116 btrfs_node_key_to_cpu(node->eb, &key, 0); 2569 btrfs_node_key_to_cpu(node->eb, &key, 0);
2117 return do_relocation(trans, node, &key, path, 0); 2570 return do_relocation(trans, rc, node, &key, path, 0);
2118} 2571}
2119 2572
2120static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2573static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121 struct backref_cache *cache, 2574 struct reloc_control *rc,
2122 struct btrfs_path *path) 2575 struct btrfs_path *path, int err)
2123{ 2576{
2577 LIST_HEAD(list);
2578 struct backref_cache *cache = &rc->backref_cache;
2124 struct backref_node *node; 2579 struct backref_node *node;
2125 int level; 2580 int level;
2126 int ret; 2581 int ret;
2127 int err = 0;
2128 2582
2129 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2583 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130 while (!list_empty(&cache->pending[level])) { 2584 while (!list_empty(&cache->pending[level])) {
2131 node = list_entry(cache->pending[level].next, 2585 node = list_entry(cache->pending[level].next,
2132 struct backref_node, lower); 2586 struct backref_node, list);
2133 BUG_ON(node->level != level); 2587 list_move_tail(&node->list, &list);
2588 BUG_ON(!node->pending);
2134 2589
2135 ret = link_to_upper(trans, node, path); 2590 if (!err) {
2136 if (ret < 0) 2591 ret = link_to_upper(trans, rc, node, path);
2137 err = ret; 2592 if (ret < 0)
2138 /* 2593 err = ret;
2139 * this remove the node from the pending list and 2594 }
2140 * may add some other nodes to the level + 1
2141 * pending list
2142 */
2143 remove_backref_node(cache, node);
2144 } 2595 }
2596 list_splice_init(&list, &cache->pending[level]);
2145 } 2597 }
2146 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147 return err; 2598 return err;
2148} 2599}
2149 2600
2150static void mark_block_processed(struct reloc_control *rc, 2601static void mark_block_processed(struct reloc_control *rc,
2151 struct backref_node *node) 2602 u64 bytenr, u32 blocksize)
2603{
2604 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2605 EXTENT_DIRTY, GFP_NOFS);
2606}
2607
2608static void __mark_block_processed(struct reloc_control *rc,
2609 struct backref_node *node)
2152{ 2610{
2153 u32 blocksize; 2611 u32 blocksize;
2154 if (node->level == 0 || 2612 if (node->level == 0 ||
2155 in_block_group(node->bytenr, rc->block_group)) { 2613 in_block_group(node->bytenr, rc->block_group)) {
2156 blocksize = btrfs_level_size(rc->extent_root, node->level); 2614 blocksize = btrfs_level_size(rc->extent_root, node->level);
2157 set_extent_bits(&rc->processed_blocks, node->bytenr, 2615 mark_block_processed(rc, node->bytenr, blocksize);
2158 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159 GFP_NOFS);
2160 } 2616 }
2161 node->processed = 1; 2617 node->processed = 1;
2162} 2618}
@@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2179 if (next->processed) 2635 if (next->processed)
2180 break; 2636 break;
2181 2637
2182 mark_block_processed(rc, next); 2638 __mark_block_processed(rc, next);
2183 2639
2184 if (list_empty(&next->upper)) 2640 if (list_empty(&next->upper))
2185 break; 2641 break;
@@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2202 return 0; 2658 return 0;
2203} 2659}
2204 2660
2205/*
2206 * check if there are any file extent pointers in the leaf point to
2207 * data require processing
2208 */
2209static int check_file_extents(struct reloc_control *rc,
2210 u64 bytenr, u32 blocksize, u64 ptr_gen)
2211{
2212 struct btrfs_key found_key;
2213 struct btrfs_file_extent_item *fi;
2214 struct extent_buffer *leaf;
2215 u32 nritems;
2216 int i;
2217 int ret = 0;
2218
2219 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220
2221 nritems = btrfs_header_nritems(leaf);
2222 for (i = 0; i < nritems; i++) {
2223 cond_resched();
2224 btrfs_item_key_to_cpu(leaf, &found_key, i);
2225 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226 continue;
2227 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228 if (btrfs_file_extent_type(leaf, fi) ==
2229 BTRFS_FILE_EXTENT_INLINE)
2230 continue;
2231 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232 if (bytenr == 0)
2233 continue;
2234 if (in_block_group(bytenr, rc->block_group)) {
2235 ret = 1;
2236 break;
2237 }
2238 }
2239 free_extent_buffer(leaf);
2240 return ret;
2241}
2242
2243/*
2244 * scan child blocks of a given block to find blocks require processing
2245 */
2246static int add_child_blocks(struct btrfs_trans_handle *trans,
2247 struct reloc_control *rc,
2248 struct backref_node *node,
2249 struct rb_root *blocks)
2250{
2251 struct tree_block *block;
2252 struct rb_node *rb_node;
2253 u64 bytenr;
2254 u64 ptr_gen;
2255 u32 blocksize;
2256 u32 nritems;
2257 int i;
2258 int err = 0;
2259
2260 nritems = btrfs_header_nritems(node->eb);
2261 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262 for (i = 0; i < nritems; i++) {
2263 cond_resched();
2264 bytenr = btrfs_node_blockptr(node->eb, i);
2265 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266 if (ptr_gen == trans->transid)
2267 continue;
2268 if (!in_block_group(bytenr, rc->block_group) &&
2269 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270 continue;
2271 if (tree_block_processed(bytenr, blocksize, rc))
2272 continue;
2273
2274 readahead_tree_block(rc->extent_root,
2275 bytenr, blocksize, ptr_gen);
2276 }
2277
2278 for (i = 0; i < nritems; i++) {
2279 cond_resched();
2280 bytenr = btrfs_node_blockptr(node->eb, i);
2281 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282 if (ptr_gen == trans->transid)
2283 continue;
2284 if (!in_block_group(bytenr, rc->block_group) &&
2285 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286 continue;
2287 if (tree_block_processed(bytenr, blocksize, rc))
2288 continue;
2289 if (!in_block_group(bytenr, rc->block_group) &&
2290 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291 continue;
2292
2293 block = kmalloc(sizeof(*block), GFP_NOFS);
2294 if (!block) {
2295 err = -ENOMEM;
2296 break;
2297 }
2298 block->bytenr = bytenr;
2299 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300 block->level = node->level - 1;
2301 block->key_ready = 1;
2302 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303 BUG_ON(rb_node);
2304 }
2305 if (err)
2306 free_block_list(blocks);
2307 return err;
2308}
2309
2310/*
2311 * find adjacent blocks require processing
2312 */
2313static noinline_for_stack
2314int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315 struct reloc_control *rc,
2316 struct backref_cache *cache,
2317 struct rb_root *blocks, int level,
2318 struct backref_node **upper)
2319{
2320 struct backref_node *node;
2321 int ret = 0;
2322
2323 WARN_ON(!list_empty(&cache->pending[level]));
2324
2325 if (list_empty(&cache->pending[level + 1]))
2326 return 1;
2327
2328 node = list_entry(cache->pending[level + 1].next,
2329 struct backref_node, lower);
2330 if (node->eb)
2331 ret = add_child_blocks(trans, rc, node, blocks);
2332
2333 *upper = node;
2334 return ret;
2335}
2336
2337static int get_tree_block_key(struct reloc_control *rc, 2661static int get_tree_block_key(struct reloc_control *rc,
2338 struct tree_block *block) 2662 struct tree_block *block)
2339{ 2663{
@@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2371 struct btrfs_path *path) 2695 struct btrfs_path *path)
2372{ 2696{
2373 struct btrfs_root *root; 2697 struct btrfs_root *root;
2374 int ret; 2698 int release = 0;
2699 int ret = 0;
2375 2700
2701 if (!node)
2702 return 0;
2703
2704 BUG_ON(node->processed);
2376 root = select_one_root(trans, node); 2705 root = select_one_root(trans, node);
2377 if (unlikely(!root)) { 2706 if (root == ERR_PTR(-ENOENT)) {
2378 rc->found_old_snapshot = 1;
2379 update_processed_blocks(rc, node); 2707 update_processed_blocks(rc, node);
2380 return 0; 2708 goto out;
2381 } 2709 }
2382 2710
2383 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2711 if (!root || root->ref_cows) {
2384 ret = do_relocation(trans, node, key, path, 1); 2712 ret = reserve_metadata_space(trans, rc, node);
2385 if (ret < 0) 2713 if (ret)
2386 goto out;
2387 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388 ret = replace_file_extents(trans, rc, root,
2389 node->eb, NULL);
2390 if (ret < 0)
2391 goto out;
2392 }
2393 drop_node_buffer(node);
2394 } else if (!root->ref_cows) {
2395 path->lowest_level = node->level;
2396 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397 btrfs_release_path(root, path);
2398 if (ret < 0)
2399 goto out; 2714 goto out;
2400 } else if (root != node->root) { 2715 release = 1;
2401 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402 } 2716 }
2403 2717
2404 update_processed_blocks(rc, node); 2718 if (root) {
2405 ret = 0; 2719 if (root->ref_cows) {
2720 BUG_ON(node->new_bytenr);
2721 BUG_ON(!list_empty(&node->list));
2722 btrfs_record_root_in_trans(trans, root);
2723 root = root->reloc_root;
2724 node->new_bytenr = root->node->start;
2725 node->root = root;
2726 list_add_tail(&node->list, &rc->backref_cache.changed);
2727 } else {
2728 path->lowest_level = node->level;
2729 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730 btrfs_release_path(root, path);
2731 if (ret > 0)
2732 ret = 0;
2733 }
2734 if (!ret)
2735 update_processed_blocks(rc, node);
2736 } else {
2737 ret = do_relocation(trans, rc, node, key, path, 1);
2738 }
2406out: 2739out:
2407 drop_node_buffer(node); 2740 if (ret || node->level == 0 || node->cowonly) {
2741 if (release)
2742 release_metadata_space(rc, node);
2743 remove_backref_node(&rc->backref_cache, node);
2744 }
2408 return ret; 2745 return ret;
2409} 2746}
2410 2747
@@ -2415,12 +2752,10 @@ static noinline_for_stack
2415int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2752int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416 struct reloc_control *rc, struct rb_root *blocks) 2753 struct reloc_control *rc, struct rb_root *blocks)
2417{ 2754{
2418 struct backref_cache *cache;
2419 struct backref_node *node; 2755 struct backref_node *node;
2420 struct btrfs_path *path; 2756 struct btrfs_path *path;
2421 struct tree_block *block; 2757 struct tree_block *block;
2422 struct rb_node *rb_node; 2758 struct rb_node *rb_node;
2423 int level = -1;
2424 int ret; 2759 int ret;
2425 int err = 0; 2760 int err = 0;
2426 2761
@@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2428 if (!path) 2763 if (!path)
2429 return -ENOMEM; 2764 return -ENOMEM;
2430 2765
2431 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432 if (!cache) {
2433 btrfs_free_path(path);
2434 return -ENOMEM;
2435 }
2436
2437 backref_cache_init(cache);
2438
2439 rb_node = rb_first(blocks); 2766 rb_node = rb_first(blocks);
2440 while (rb_node) { 2767 while (rb_node) {
2441 block = rb_entry(rb_node, struct tree_block, rb_node); 2768 block = rb_entry(rb_node, struct tree_block, rb_node);
2442 if (level == -1)
2443 level = block->level;
2444 else
2445 BUG_ON(level != block->level);
2446 if (!block->key_ready) 2769 if (!block->key_ready)
2447 reada_tree_block(rc, block); 2770 reada_tree_block(rc, block);
2448 rb_node = rb_next(rb_node); 2771 rb_node = rb_next(rb_node);
@@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2460 while (rb_node) { 2783 while (rb_node) {
2461 block = rb_entry(rb_node, struct tree_block, rb_node); 2784 block = rb_entry(rb_node, struct tree_block, rb_node);
2462 2785
2463 node = build_backref_tree(rc, cache, &block->key, 2786 node = build_backref_tree(rc, &block->key,
2464 block->level, block->bytenr); 2787 block->level, block->bytenr);
2465 if (IS_ERR(node)) { 2788 if (IS_ERR(node)) {
2466 err = PTR_ERR(node); 2789 err = PTR_ERR(node);
@@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2470 ret = relocate_tree_block(trans, rc, node, &block->key, 2793 ret = relocate_tree_block(trans, rc, node, &block->key,
2471 path); 2794 path);
2472 if (ret < 0) { 2795 if (ret < 0) {
2473 err = ret; 2796 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2797 err = ret;
2474 goto out; 2798 goto out;
2475 } 2799 }
2476 remove_backref_node(cache, node);
2477 rb_node = rb_next(rb_node); 2800 rb_node = rb_next(rb_node);
2478 } 2801 }
2479 2802out:
2480 if (level > 0)
2481 goto out;
2482
2483 free_block_list(blocks); 2803 free_block_list(blocks);
2804 err = finish_pending_nodes(trans, rc, path, err);
2484 2805
2485 /* 2806 btrfs_free_path(path);
2486 * now backrefs of some upper level tree blocks have been cached, 2807 return err;
2487 * try relocating blocks referenced by these upper level blocks. 2808}
2488 */
2489 while (1) {
2490 struct backref_node *upper = NULL;
2491 if (trans->transaction->in_commit ||
2492 trans->transaction->delayed_refs.flushing)
2493 break;
2494 2809
2495 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2810static noinline_for_stack
2496 &upper); 2811int prealloc_file_extent_cluster(struct inode *inode,
2497 if (ret < 0) 2812 struct file_extent_cluster *cluster)
2498 err = ret; 2813{
2499 if (ret != 0) 2814 u64 alloc_hint = 0;
2500 break; 2815 u64 start;
2816 u64 end;
2817 u64 offset = BTRFS_I(inode)->index_cnt;
2818 u64 num_bytes;
2819 int nr = 0;
2820 int ret = 0;
2501 2821
2502 rb_node = rb_first(blocks); 2822 BUG_ON(cluster->start != cluster->boundary[0]);
2503 while (rb_node) { 2823 mutex_lock(&inode->i_mutex);
2504 block = rb_entry(rb_node, struct tree_block, rb_node);
2505 if (trans->transaction->in_commit ||
2506 trans->transaction->delayed_refs.flushing)
2507 goto out;
2508 BUG_ON(!block->key_ready);
2509 node = build_backref_tree(rc, cache, &block->key,
2510 level, block->bytenr);
2511 if (IS_ERR(node)) {
2512 err = PTR_ERR(node);
2513 goto out;
2514 }
2515 2824
2516 ret = relocate_tree_block(trans, rc, node, 2825 ret = btrfs_check_data_free_space(inode, cluster->end +
2517 &block->key, path); 2826 1 - cluster->start);
2518 if (ret < 0) { 2827 if (ret)
2519 err = ret; 2828 goto out;
2520 goto out;
2521 }
2522 remove_backref_node(cache, node);
2523 rb_node = rb_next(rb_node);
2524 }
2525 free_block_list(blocks);
2526 2829
2527 if (upper) { 2830 while (nr < cluster->nr) {
2528 ret = link_to_upper(trans, upper, path); 2831 start = cluster->boundary[nr] - offset;
2529 if (ret < 0) { 2832 if (nr + 1 < cluster->nr)
2530 err = ret; 2833 end = cluster->boundary[nr + 1] - 1 - offset;
2531 break; 2834 else
2532 } 2835 end = cluster->end - offset;
2533 remove_backref_node(cache, upper); 2836
2534 } 2837 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2838 num_bytes = end + 1 - start;
2839 ret = btrfs_prealloc_file_range(inode, 0, start,
2840 num_bytes, num_bytes,
2841 end + 1, &alloc_hint);
2842 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2843 if (ret)
2844 break;
2845 nr++;
2535 } 2846 }
2847 btrfs_free_reserved_data_space(inode, cluster->end +
2848 1 - cluster->start);
2536out: 2849out:
2537 free_block_list(blocks); 2850 mutex_unlock(&inode->i_mutex);
2538 2851 return ret;
2539 ret = finish_pending_nodes(trans, cache, path);
2540 if (ret < 0)
2541 err = ret;
2542
2543 kfree(cache);
2544 btrfs_free_path(path);
2545 return err;
2546} 2852}
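
The prealloc_file_extent_cluster() helper added above walks cluster->boundary[] and preallocates one range per boundary, shifting everything by the inode's index_cnt offset. Below is a minimal user-space sketch of just that boundary arithmetic; the numbers and variable names are made up for illustration and none of the kernel types or APIs are used.

/* illustrative only: mirrors the start/end computation in
 * prealloc_file_extent_cluster(), using made-up numbers */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t boundary[] = { 4096, 16384, 40960 };	/* cluster->boundary[] */
	int nr_boundaries = 3;				/* cluster->nr */
	uint64_t cluster_end = 65535;			/* cluster->end (inclusive) */
	uint64_t offset = 4096;				/* BTRFS_I(inode)->index_cnt */

	for (int nr = 0; nr < nr_boundaries; nr++) {
		uint64_t start = boundary[nr] - offset;
		uint64_t end = (nr + 1 < nr_boundaries) ?
			       boundary[nr + 1] - 1 - offset :
			       cluster_end - offset;
		printf("prealloc range %d: [%llu, %llu], %llu bytes\n", nr,
		       (unsigned long long)start, (unsigned long long)end,
		       (unsigned long long)(end + 1 - start));
	}
	return 0;
}

Running it shows how each range is closed by the next boundary, and the last one by cluster->end.
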
2547 2853
2548static noinline_for_stack 2854static noinline_for_stack
@@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2588 u64 offset = BTRFS_I(inode)->index_cnt; 2894 u64 offset = BTRFS_I(inode)->index_cnt;
2589 unsigned long index; 2895 unsigned long index;
2590 unsigned long last_index; 2896 unsigned long last_index;
2591 unsigned int dirty_page = 0;
2592 struct page *page; 2897 struct page *page;
2593 struct file_ra_state *ra; 2898 struct file_ra_state *ra;
2594 int nr = 0; 2899 int nr = 0;
@@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2601 if (!ra) 2906 if (!ra)
2602 return -ENOMEM; 2907 return -ENOMEM;
2603 2908
2604 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2909 ret = prealloc_file_extent_cluster(inode, cluster);
2605 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2910 if (ret)
2911 goto out;
2606 2912
2607 mutex_lock(&inode->i_mutex); 2913 file_ra_state_init(ra, inode->i_mapping);
2608 2914
2609 i_size_write(inode, cluster->end + 1 - offset);
2610 ret = setup_extent_mapping(inode, cluster->start - offset, 2915 ret = setup_extent_mapping(inode, cluster->start - offset,
2611 cluster->end - offset, cluster->start); 2916 cluster->end - offset, cluster->start);
2612 if (ret) 2917 if (ret)
2613 goto out_unlock; 2918 goto out;
2614
2615 file_ra_state_init(ra, inode->i_mapping);
2616 2919
2617 WARN_ON(cluster->start != cluster->boundary[0]); 2920 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2921 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2618 while (index <= last_index) { 2922 while (index <= last_index) {
2923 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2924 if (ret)
2925 goto out;
2926
2619 page = find_lock_page(inode->i_mapping, index); 2927 page = find_lock_page(inode->i_mapping, index);
2620 if (!page) { 2928 if (!page) {
2621 page_cache_sync_readahead(inode->i_mapping, 2929 page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2623 last_index + 1 - index); 2931 last_index + 1 - index);
2624 page = grab_cache_page(inode->i_mapping, index); 2932 page = grab_cache_page(inode->i_mapping, index);
2625 if (!page) { 2933 if (!page) {
2934 btrfs_delalloc_release_metadata(inode,
2935 PAGE_CACHE_SIZE);
2626 ret = -ENOMEM; 2936 ret = -ENOMEM;
2627 goto out_unlock; 2937 goto out;
2628 } 2938 }
2629 } 2939 }
2630 2940
@@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2640 if (!PageUptodate(page)) { 2950 if (!PageUptodate(page)) {
2641 unlock_page(page); 2951 unlock_page(page);
2642 page_cache_release(page); 2952 page_cache_release(page);
2953 btrfs_delalloc_release_metadata(inode,
2954 PAGE_CACHE_SIZE);
2643 ret = -EIO; 2955 ret = -EIO;
2644 goto out_unlock; 2956 goto out;
2645 } 2957 }
2646 } 2958 }
2647 2959
@@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2660 EXTENT_BOUNDARY, GFP_NOFS); 2972 EXTENT_BOUNDARY, GFP_NOFS);
2661 nr++; 2973 nr++;
2662 } 2974 }
2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 2975
2976 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2665 set_page_dirty(page); 2977 set_page_dirty(page);
2666 dirty_page++;
2667 2978
2668 unlock_extent(&BTRFS_I(inode)->io_tree, 2979 unlock_extent(&BTRFS_I(inode)->io_tree,
2669 page_start, page_end, GFP_NOFS); 2980 page_start, page_end, GFP_NOFS);
@@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2671 page_cache_release(page); 2982 page_cache_release(page);
2672 2983
2673 index++; 2984 index++;
2674 if (nr < cluster->nr && 2985 balance_dirty_pages_ratelimited(inode->i_mapping);
2675 page_end + 1 + offset == cluster->boundary[nr]) { 2986 btrfs_throttle(BTRFS_I(inode)->root);
2676 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677 dirty_page);
2678 dirty_page = 0;
2679 }
2680 }
2681 if (dirty_page) {
2682 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683 dirty_page);
2684 } 2987 }
2685 WARN_ON(nr != cluster->nr); 2988 WARN_ON(nr != cluster->nr);
2686out_unlock: 2989out:
2687 mutex_unlock(&inode->i_mutex);
2688 kfree(ra); 2990 kfree(ra);
2689 return ret; 2991 return ret;
2690} 2992}
@@ -2870,9 +3172,6 @@ out:
2870static int block_use_full_backref(struct reloc_control *rc, 3172static int block_use_full_backref(struct reloc_control *rc,
2871 struct extent_buffer *eb) 3173 struct extent_buffer *eb)
2872{ 3174{
2873 struct btrfs_path *path;
2874 struct btrfs_extent_item *ei;
2875 struct btrfs_key key;
2876 u64 flags; 3175 u64 flags;
2877 int ret; 3176 int ret;
2878 3177
@@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2880 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3179 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881 return 1; 3180 return 1;
2882 3181
2883 path = btrfs_alloc_path(); 3182 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2884 BUG_ON(!path); 3183 eb->start, eb->len, NULL, &flags);
2885
2886 key.objectid = eb->start;
2887 key.type = BTRFS_EXTENT_ITEM_KEY;
2888 key.offset = eb->len;
2889
2890 path->search_commit_root = 1;
2891 path->skip_locking = 1;
2892 ret = btrfs_search_slot(NULL, rc->extent_root,
2893 &key, path, 0, 0);
2894 BUG_ON(ret); 3184 BUG_ON(ret);
2895 3185
2896 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897 struct btrfs_extent_item);
2898 flags = btrfs_extent_flags(path->nodes[0], ei);
2899 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3186 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901 ret = 1; 3187 ret = 1;
2902 else 3188 else
2903 ret = 0; 3189 ret = 0;
2904 btrfs_free_path(path);
2905 return ret; 3190 return ret;
2906} 3191}
2907 3192
@@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
3074 struct btrfs_extent_inline_ref *iref; 3359 struct btrfs_extent_inline_ref *iref;
3075 unsigned long ptr; 3360 unsigned long ptr;
3076 unsigned long end; 3361 unsigned long end;
3077 u32 blocksize; 3362 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3078 int ret; 3363 int ret;
3079 int err = 0; 3364 int err = 0;
3080 3365
3081 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082 extent_key->offset);
3083 BUG_ON(ret < 0);
3084 if (ret > 0) {
3085 /* the relocated data is fragmented */
3086 rc->extents_skipped++;
3087 btrfs_release_path(rc->extent_root, path);
3088 return 0;
3089 }
3090
3091 blocksize = btrfs_level_size(rc->extent_root, 0);
3092
3093 eb = path->nodes[0]; 3366 eb = path->nodes[0];
3094 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3367 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3368 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
3170 */ 3443 */
3171static noinline_for_stack 3444static noinline_for_stack
3172int find_next_extent(struct btrfs_trans_handle *trans, 3445int find_next_extent(struct btrfs_trans_handle *trans,
3173 struct reloc_control *rc, struct btrfs_path *path) 3446 struct reloc_control *rc, struct btrfs_path *path,
3447 struct btrfs_key *extent_key)
3174{ 3448{
3175 struct btrfs_key key; 3449 struct btrfs_key key;
3176 struct extent_buffer *leaf; 3450 struct extent_buffer *leaf;
@@ -3225,6 +3499,7 @@ next:
3225 rc->search_start = end + 1; 3499 rc->search_start = end + 1;
3226 } else { 3500 } else {
3227 rc->search_start = key.objectid + key.offset; 3501 rc->search_start = key.objectid + key.offset;
3502 memcpy(extent_key, &key, sizeof(key));
3228 return 0; 3503 return 0;
3229 } 3504 }
3230 } 3505 }
@@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags)
3262 return 0; 3537 return 0;
3263} 3538}
3264 3539
3540static noinline_for_stack
3541int prepare_to_relocate(struct reloc_control *rc)
3542{
3543 struct btrfs_trans_handle *trans;
3544 int ret;
3545
3546 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3547 if (!rc->block_rsv)
3548 return -ENOMEM;
3549
3550 /*
3551 * reserve some space for creating reloc trees.
 3552 * btrfs_init_reloc_root will use it when there
 3553 * is no reservation in the transaction handle.
3554 */
3555 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3556 rc->extent_root->nodesize * 256,
3557 &rc->block_rsv_retries);
3558 if (ret)
3559 return ret;
3560
3561 rc->block_rsv->refill_used = 1;
3562 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3563
3564 memset(&rc->cluster, 0, sizeof(rc->cluster));
3565 rc->search_start = rc->block_group->key.objectid;
3566 rc->extents_found = 0;
3567 rc->nodes_relocated = 0;
3568 rc->merging_rsv_size = 0;
3569 rc->block_rsv_retries = 0;
3570
3571 rc->create_reloc_tree = 1;
3572 set_reloc_control(rc);
3573
3574 trans = btrfs_join_transaction(rc->extent_root, 1);
3575 btrfs_commit_transaction(trans, rc->extent_root);
3576 return 0;
3577}
3265 3578
3266static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3579static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267{ 3580{
3268 struct rb_root blocks = RB_ROOT; 3581 struct rb_root blocks = RB_ROOT;
3269 struct btrfs_key key; 3582 struct btrfs_key key;
3270 struct file_extent_cluster *cluster;
3271 struct btrfs_trans_handle *trans = NULL; 3583 struct btrfs_trans_handle *trans = NULL;
3272 struct btrfs_path *path; 3584 struct btrfs_path *path;
3273 struct btrfs_extent_item *ei; 3585 struct btrfs_extent_item *ei;
@@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3277 int ret; 3589 int ret;
3278 int err = 0; 3590 int err = 0;
3279 3591
3280 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281 if (!cluster)
3282 return -ENOMEM;
3283
3284 path = btrfs_alloc_path(); 3592 path = btrfs_alloc_path();
3285 if (!path) { 3593 if (!path)
3286 kfree(cluster);
3287 return -ENOMEM; 3594 return -ENOMEM;
3288 }
3289
3290 rc->extents_found = 0;
3291 rc->extents_skipped = 0;
3292
3293 rc->search_start = rc->block_group->key.objectid;
3294 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295 GFP_NOFS);
3296
3297 rc->create_reloc_root = 1;
3298 set_reloc_control(rc);
3299 3595
3300 trans = btrfs_start_transaction(rc->extent_root, 1); 3596 ret = prepare_to_relocate(rc);
3301 btrfs_commit_transaction(trans, rc->extent_root); 3597 if (ret) {
3598 err = ret;
3599 goto out_free;
3600 }
3302 3601
3303 while (1) { 3602 while (1) {
3304 trans = btrfs_start_transaction(rc->extent_root, 1); 3603 trans = btrfs_start_transaction(rc->extent_root, 0);
3604
3605 if (update_backref_cache(trans, &rc->backref_cache)) {
3606 btrfs_end_transaction(trans, rc->extent_root);
3607 continue;
3608 }
3305 3609
3306 ret = find_next_extent(trans, rc, path); 3610 ret = find_next_extent(trans, rc, path, &key);
3307 if (ret < 0) 3611 if (ret < 0)
3308 err = ret; 3612 err = ret;
3309 if (ret != 0) 3613 if (ret != 0)
@@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3313 3617
3314 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3618 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315 struct btrfs_extent_item); 3619 struct btrfs_extent_item);
3316 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3620 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3317 item_size = btrfs_item_size_nr(path->nodes[0],
3318 path->slots[0]);
3319 if (item_size >= sizeof(*ei)) { 3621 if (item_size >= sizeof(*ei)) {
3320 flags = btrfs_extent_flags(path->nodes[0], ei); 3622 flags = btrfs_extent_flags(path->nodes[0], ei);
3321 ret = check_extent_flags(flags); 3623 ret = check_extent_flags(flags);
@@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3658 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3659 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3660 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3661 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3662 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3663 } else {
3362 btrfs_release_path(rc->extent_root, path); 3664 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3665 ret = 0;
3364 } 3666 }
3365 if (ret < 0) { 3667 if (ret < 0) {
3366 err = 0; 3668 err = ret;
3367 break; 3669 break;
3368 } 3670 }
3369 3671
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3672 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3673 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3674 if (ret < 0) {
3675 if (ret != -EAGAIN) {
3676 err = ret;
3677 break;
3678 }
3679 rc->extents_found--;
3680 rc->search_start = key.objectid;
3681 }
3682 }
3683
3684 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3685 rc->block_rsv, 0, 5);
3686 if (ret < 0) {
3687 if (ret != -EAGAIN) {
3373 err = ret; 3688 err = ret;
3689 WARN_ON(1);
3374 break; 3690 break;
3375 } 3691 }
3692 rc->commit_transaction = 1;
3376 } 3693 }
3377 3694
3378 nr = trans->blocks_used; 3695 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3696 rc->commit_transaction = 0;
3697 ret = btrfs_commit_transaction(trans, rc->extent_root);
3698 BUG_ON(ret);
3699 } else {
3700 nr = trans->blocks_used;
3701 btrfs_end_transaction_throttle(trans, rc->extent_root);
3702 btrfs_btree_balance_dirty(rc->extent_root, nr);
3703 }
3380 trans = NULL; 3704 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3705
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3706 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3707 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3708 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3709 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3710 &key, &rc->cluster);
3388 if (ret < 0) { 3711 if (ret < 0) {
3389 err = ret; 3712 err = ret;
3390 break; 3713 break;
3391 } 3714 }
3392 } 3715 }
3393 } 3716 }
3394 btrfs_free_path(path); 3717
3718 btrfs_release_path(rc->extent_root, path);
3719 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3720 GFP_NOFS);
3395 3721
3396 if (trans) { 3722 if (trans) {
3397 nr = trans->blocks_used; 3723 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3724 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3725 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3726 }
3401 3727
3402 if (!err) { 3728 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3729 ret = relocate_file_extent_cluster(rc->data_inode,
3730 &rc->cluster);
3404 if (ret < 0) 3731 if (ret < 0)
3405 err = ret; 3732 err = ret;
3406 } 3733 }
3407 3734
3408 kfree(cluster); 3735 rc->create_reloc_tree = 0;
3736 set_reloc_control(rc);
3409 3737
3410 rc->create_reloc_root = 0; 3738 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3739 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3740
3413 if (rc->extents_found > 0) { 3741 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3742
3418 merge_reloc_roots(rc); 3743 merge_reloc_roots(rc);
3419 3744
3745 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3746 unset_reloc_control(rc);
3747 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3748
3422 /* get rid of pinned extents */ 3749 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3750 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3751 btrfs_commit_transaction(trans, rc->extent_root);
3425 3752out_free:
3753 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3754 btrfs_free_path(path);
3426 return err; 3755 return err;
3427} 3756}
3428 3757
@@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3777 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3778 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3779 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3780 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3781 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3782 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3783 btrfs_release_path(root, path);
3454out: 3784out:
@@ -3460,8 +3790,9 @@ out:
3460 * helper to create inode for data relocation. 3790 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3791 * the inode is in data relocation tree and its link count is 0
3462 */ 3792 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3793static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3794struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3795 struct btrfs_block_group_cache *group)
3465{ 3796{
3466 struct inode *inode = NULL; 3797 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3798 struct btrfs_trans_handle *trans;
@@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3806 if (IS_ERR(root))
3476 return ERR_CAST(root); 3807 return ERR_CAST(root);
3477 3808
3478 trans = btrfs_start_transaction(root, 1); 3809 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3810 if (IS_ERR(trans))
3811 return ERR_CAST(trans);
3480 3812
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3813 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3814 if (err)
@@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3828out:
3497 nr = trans->blocks_used; 3829 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3830 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3831 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3832 if (err) {
3502 if (inode) 3833 if (inode)
@@ -3506,6 +3837,21 @@ out:
3506 return inode; 3837 return inode;
3507} 3838}
3508 3839
3840static struct reloc_control *alloc_reloc_control(void)
3841{
3842 struct reloc_control *rc;
3843
3844 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3845 if (!rc)
3846 return NULL;
3847
3848 INIT_LIST_HEAD(&rc->reloc_roots);
3849 backref_cache_init(&rc->backref_cache);
3850 mapping_tree_init(&rc->reloc_root_tree);
3851 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3852 return rc;
3853}
3854
3509/* 3855/*
3510 * function to relocate all extents in a block group. 3856 * function to relocate all extents in a block group.
3511 */ 3857 */
@@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3860 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3861 struct reloc_control *rc;
3516 int ret; 3862 int ret;
3863 int rw = 0;
3517 int err = 0; 3864 int err = 0;
3518 3865
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3866 rc = alloc_reloc_control();
3520 if (!rc) 3867 if (!rc)
3521 return -ENOMEM; 3868 return -ENOMEM;
3522 3869
3523 mapping_tree_init(&rc->reloc_root_tree); 3870 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3871
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3872 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3873 BUG_ON(!rc->block_group);
3529 3874
3530 btrfs_init_workers(&rc->workers, "relocate", 3875 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3876 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3877 if (ret) {
3533 rc->extent_root = extent_root; 3878 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3879 goto out;
3880 }
3881 rw = 1;
3882 }
3535 3883
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3884 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3885 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3896 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3897
3550 while (1) { 3898 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3899 mutex_lock(&fs_info->cleaner_mutex);
3555 3900
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3901 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3904 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3905 if (ret < 0) {
3561 err = ret; 3906 err = ret;
3562 break; 3907 goto out;
3563 } 3908 }
3564 3909
3565 if (rc->extents_found == 0) 3910 if (rc->extents_found == 0)
@@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3918 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3919 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3920 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3921 }
3589 } 3922 }
3590 3923
@@ -3597,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3930 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3931 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3932out:
3933 if (err && rw)
3934 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3935 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3936 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3937 kfree(rc);
3604 return err; 3938 return err;
@@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3943 struct btrfs_trans_handle *trans;
3610 int ret; 3944 int ret;
3611 3945
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3946 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3947
3614 memset(&root->root_item.drop_progress, 0, 3948 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3949 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4036 if (list_empty(&reloc_roots))
3703 goto out; 4037 goto out;
3704 4038
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4039 rc = alloc_reloc_control();
3706 if (!rc) { 4040 if (!rc) {
3707 err = -ENOMEM; 4041 err = -ENOMEM;
3708 goto out; 4042 goto out;
3709 } 4043 }
3710 4044
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4045 rc->extent_root = root->fs_info->extent_root;
3716 4046
3717 set_reloc_control(rc); 4047 set_reloc_control(rc);
3718 4048
4049 trans = btrfs_join_transaction(rc->extent_root, 1);
4050
4051 rc->merge_reloc_tree = 1;
4052
3719 while (!list_empty(&reloc_roots)) { 4053 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4054 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4055 struct btrfs_root, root_list);
@@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4069 fs_root->reloc_root = reloc_root;
3736 } 4070 }
3737 4071
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4072 btrfs_commit_transaction(trans, rc->extent_root);
3740 4073
3741 merge_reloc_roots(rc); 4074 merge_reloc_roots(rc);
3742 4075
3743 unset_reloc_control(rc); 4076 unset_reloc_control(rc);
3744 4077
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4078 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4079 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4080out:
3748 if (rc) { 4081 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4082 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4083 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4084 struct btrfs_root, root_list);
@@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4144 btrfs_put_ordered_extent(ordered);
3815 return 0; 4145 return 0;
3816} 4146}
4147
4148void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4149 struct btrfs_root *root, struct extent_buffer *buf,
4150 struct extent_buffer *cow)
4151{
4152 struct reloc_control *rc;
4153 struct backref_node *node;
4154 int first_cow = 0;
4155 int level;
4156 int ret;
4157
4158 rc = root->fs_info->reloc_ctl;
4159 if (!rc)
4160 return;
4161
4162 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4163 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4164
4165 level = btrfs_header_level(buf);
4166 if (btrfs_header_generation(buf) <=
4167 btrfs_root_last_snapshot(&root->root_item))
4168 first_cow = 1;
4169
4170 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4171 rc->create_reloc_tree) {
4172 WARN_ON(!first_cow && level == 0);
4173
4174 node = rc->backref_cache.path[level];
4175 BUG_ON(node->bytenr != buf->start &&
4176 node->new_bytenr != buf->start);
4177
4178 drop_node_buffer(node);
4179 extent_buffer_get(cow);
4180 node->eb = cow;
4181 node->new_bytenr = cow->start;
4182
4183 if (!node->pending) {
4184 list_move_tail(&node->list,
4185 &rc->backref_cache.pending[level]);
4186 node->pending = 1;
4187 }
4188
4189 if (first_cow)
4190 __mark_block_processed(rc, node);
4191
4192 if (first_cow && level > 0)
4193 rc->nodes_relocated += buf->len;
4194 }
4195
4196 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4197 ret = replace_file_extents(trans, rc, root, cow);
4198 BUG_ON(ret);
4199 }
4200}
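
btrfs_reloc_cow_block() above only treats a COW as the first one since the last snapshot when the buffer's generation is not newer than the root's last_snapshot; a trivial standalone sketch of that test, with made-up generation values:

/* illustrative only: the first_cow test in btrfs_reloc_cow_block() */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t buf_generation = 100;	/* btrfs_header_generation(buf) */
	uint64_t last_snapshot  = 100;	/* btrfs_root_last_snapshot() */
	int first_cow = 0;

	if (buf_generation <= last_snapshot)
		first_cow = 1;	/* first COW since the last snapshot */

	printf("first_cow = %d\n", first_cow);
	return 0;
}
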
4201
4202/*
 4203 * called before creating a snapshot. it calculates the metadata
 4204 * reservation required for relocating tree blocks in the snapshot
4205 */
4206void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4207 struct btrfs_pending_snapshot *pending,
4208 u64 *bytes_to_reserve)
4209{
4210 struct btrfs_root *root;
4211 struct reloc_control *rc;
4212
4213 root = pending->root;
4214 if (!root->reloc_root)
4215 return;
4216
4217 rc = root->fs_info->reloc_ctl;
4218 if (!rc->merge_reloc_tree)
4219 return;
4220
4221 root = root->reloc_root;
4222 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4223 /*
4224 * relocation is in the stage of merging trees. the space
4225 * used by merging a reloc tree is twice the size of
4226 * relocated tree nodes in the worst case. half for cowing
4227 * the reloc tree, half for cowing the fs tree. the space
4228 * used by cowing the reloc tree will be freed after the
 4229 * tree is dropped. if we create a snapshot, cowing the fs
 4230 * tree may use more space than it frees. so we need to
 4231 * reserve extra space.
4232 */
4233 *bytes_to_reserve += rc->nodes_relocated;
4234}
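
As a rough standalone illustration of the rule in the comment above: merging may have to COW both the reloc tree and the fs tree (hence the 2x worst case), and a snapshot taken during the merge adds rc->nodes_relocated on top of whatever it would otherwise reserve. All figures and names below are invented for the example.

/* illustrative only: the extra reservation a pending snapshot adds
 * while reloc trees are being merged */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t nodes_relocated = 32ULL * 1024 * 1024; /* rc->nodes_relocated */
	uint64_t snapshot_rsv    = 8ULL * 1024 * 1024;  /* base snapshot reservation */
	uint64_t bytes_to_reserve = snapshot_rsv;

	/* merging may COW both the reloc tree and the fs tree */
	uint64_t merge_worst_case = 2 * nodes_relocated;

	/* btrfs_reloc_pre_snapshot(): snapshotting during the merge
	 * needs the relocated node bytes reserved once more */
	bytes_to_reserve += nodes_relocated;

	printf("merge worst case: %llu bytes\n",
	       (unsigned long long)merge_worst_case);
	printf("snapshot reservation: %llu bytes\n",
	       (unsigned long long)bytes_to_reserve);
	return 0;
}
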
4235
4236/*
 4237 * called after a snapshot is created. migrates the block reservation
 4238 * and creates a reloc root for the newly created snapshot
4239 */
4240void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4241 struct btrfs_pending_snapshot *pending)
4242{
4243 struct btrfs_root *root = pending->root;
4244 struct btrfs_root *reloc_root;
4245 struct btrfs_root *new_root;
4246 struct reloc_control *rc;
4247 int ret;
4248
4249 if (!root->reloc_root)
4250 return;
4251
4252 rc = root->fs_info->reloc_ctl;
4253 rc->merging_rsv_size += rc->nodes_relocated;
4254
4255 if (rc->merge_reloc_tree) {
4256 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4257 rc->block_rsv,
4258 rc->nodes_relocated);
4259 BUG_ON(ret);
4260 }
4261
4262 new_root = pending->snap;
4263 reloc_root = create_reloc_root(trans, root->reloc_root,
4264 new_root->root_key.objectid);
4265
4266 __add_reloc_root(reloc_root);
4267 new_root->reloc_root = reloc_root;
4268
4269 if (rc->create_reloc_tree) {
4270 ret = clone_backref_node(trans, rc, root, reloc_root);
4271 BUG_ON(ret);
4272 }
4273}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..b91ccd972644 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..d34b2dfc9628 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 498 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 499 btrfs_wait_ordered_extents(root, 0, 0);
500 500
501 trans = btrfs_start_transaction(root, 1); 501 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 502 ret = btrfs_commit_transaction(trans, root);
503 return ret; 503 return ret;
504} 504}
@@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 695 return -EINVAL;
696 696
697 /* recover relocation */ 697 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 698 WARN_ON(ret);
700 699
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 700 /* recover relocation */
701 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 702 WARN_ON(ret);
703 703
704 sb->s_flags &= ~MS_RDONLY; 704 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 714 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 715 struct btrfs_space_info *found;
716 u64 total_used = 0; 716 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 717 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 718 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 719
721 rcu_read_lock(); 720 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 721 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 722 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 723 rcu_read_unlock();
740 724
741 buf->f_namelen = BTRFS_NAME_LEN; 725 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 726 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 727 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 728 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 729 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 730 buf->f_type = BTRFS_SUPER_MAGIC;
747 731
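
With the simplified accounting above, the used space is just the sum of each space_info's disk_used, and f_bavail now mirrors f_bfree instead of being estimated separately for data. A small sketch of the arithmetic with made-up numbers (plain C, no kernel structures):

/* illustrative only: the statfs math after this change */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t total_bytes = 100ULL << 30;	/* btrfs_super_total_bytes() */
	uint64_t disk_used[] = { 40ULL << 30, 2ULL << 30 }; /* per space_info */
	int bits = 12;				/* 4K blocksize */

	uint64_t total_used = 0;
	for (int i = 0; i < 2; i++)
		total_used += disk_used[i];

	uint64_t f_blocks = total_bytes >> bits;
	uint64_t f_bfree  = f_blocks - (total_used >> bits);
	uint64_t f_bavail = f_bfree;	/* no separate data estimate anymore */

	printf("blocks=%llu bfree=%llu bavail=%llu\n",
	       (unsigned long long)f_blocks,
	       (unsigned long long)f_bfree,
	       (unsigned long long)f_bavail);
	return 0;
}
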
@@ -832,11 +816,14 @@ static const struct file_operations btrfs_ctl_fops = {
832}; 816};
833 817
834static struct miscdevice btrfs_misc = { 818static struct miscdevice btrfs_misc = {
835 .minor = MISC_DYNAMIC_MINOR, 819 .minor = BTRFS_MINOR,
836 .name = "btrfs-control", 820 .name = "btrfs-control",
837 .fops = &btrfs_ctl_fops 821 .fops = &btrfs_ctl_fops
838}; 822};
839 823
824MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
825MODULE_ALIAS("devname:btrfs-control");
826
840static int btrfs_interface_init(void) 827static int btrfs_interface_init(void)
841{ 828{
842 return misc_register(&btrfs_misc); 829 return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
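
The reworked start_transaction() retries from the top whenever the running transaction is blocked or the metadata reservation comes back -EAGAIN, committing first so the next attempt sees a fresh transaction. A stripped-down sketch of that commit-and-retry control flow; the helper and return codes are invented stand-ins, not kernel APIs:

/* illustrative only: the commit-and-retry loop in start_transaction() */
#include <stdio.h>

enum { OK = 0, EAGAIN_RESERVE = 1 };

static int blocked_attempts = 2;	/* pretend the first tries find no space */

static int reserve_metadata(void)
{
	if (blocked_attempts-- > 0)
		return EAGAIN_RESERVE;	/* not enough space yet */
	return OK;
}

int main(void)
{
	int attempts = 0;
	int ret;
again:
	attempts++;
	ret = reserve_metadata();
	if (ret == EAGAIN_RESERVE) {
		/* commit the running transaction to free space, then retry */
		goto again;
	}
	printf("transaction started after %d attempt(s)\n", attempts);
	return 0;
}
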
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
 63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
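
For context, the transaction API rework above changes btrfs_start_transaction() to take a count of items to reserve metadata space for, and adds btrfs_should_end_transaction() so long-running callers can stop early once the global block reserve runs low. A minimal caller-side sketch under those assumptions; process_one_item() is a hypothetical helper standing in for the real per-item work (truncate, unlink, ...):

	static int process_many_items(struct btrfs_root *root, int nr)
	{
		struct btrfs_trans_handle *trans;
		int ret = 0;

		while (nr > 0) {
			/* reserve metadata space for one item's worth of changes */
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			do {
				ret = process_one_item(trans, root);	/* hypothetical */
				nr--;
			} while (!ret && nr > 0 &&
				 !btrfs_should_end_transaction(trans, root));

			/* drop the reservation; throttling/commit is handled inside */
			btrfs_end_transaction(trans, root);
			if (ret)
				break;
		}
		return ret;
	}
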
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
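
The tree-log changes above replace many BUG_ON()s with error propagation: on -ENOSPC the log code records last_trans_log_full_commit and btrfs_log_inode_parent() returns 1, telling the caller to fall back to a full transaction commit. A sketch of how an fsync path is expected to consume that convention, modelled loosely on btrfs_sync_file() rather than copied from it:

	/* trans was started earlier in the fsync path */
	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret > 0) {
		/* log unusable this transaction (e.g. -ENOSPC), full commit */
		ret = btrfs_commit_transaction(trans, root);
	} else if (ret == 0) {
		/* the dentry was logged, try the cheap log sync first */
		ret = btrfs_sync_log(trans, root);
		if (ret == 0)
			ret = btrfs_end_transaction(trans, root);
		else
			ret = btrfs_commit_transaction(trans, root);
	}
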
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..f0b391c50552 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
568 return ret; 568 return ret;
569} 569}
570 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should -EINVAL if an element length is not >= 0 and fitting an
608 * ssize_t. The total length is fitting an ssize_t
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), buf, len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
571static inline long 644static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 646{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 675 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 677 return ret;
605} 678}
606 679
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1150{
1078 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1081 ssize_t ret; 1154 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1155 io_fn_t fn;
1084 iov_fn_t fnv; 1156 iov_fn_t fnv;
1085 1157
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1158 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1159 if (!file->f_op)
1103 goto out; 1160 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1161
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1162 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1164 goto out;
1113 1165
1114 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1168 if (tot_len == 0) {
1145 ret = 0; 1169 ret = 0;
1146 goto out; 1170 goto out;
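
compat_rw_copy_check_uvector(), factored out above, copies and validates a compat iovec array into native struct iovec entries and returns the total length or a negative errno. A sketch of the expected calling pattern; compat_example_rw() is a hypothetical caller, not part of the patch:

	static ssize_t compat_example_rw(int type, struct file *file,
			const struct compat_iovec __user *uvector,
			unsigned long nr_segs)
	{
		struct iovec iovstack[UIO_FASTIOV];
		struct iovec *iov = iovstack;
		ssize_t tot_len;

		tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
						       UIO_FASTIOV, iovstack, &iov);
		if (tot_len <= 0)
			goto out;

		/* ... perform the actual read or write with iov and tot_len ... */

	out:
		if (iov != iovstack)
			kfree(iov);	/* the helper kmalloc'd a larger array */
		return tot_len;
	}
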
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..da111aacb46e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
 85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
 308 * @bio: The direct io bio that's being completed
309 * @error: Error if there was one
310 *
 311 * This is meant to be called by any filesystem that uses its own dio_submit_t
 312 * so that the DIO specific endio actions are dealt with after the filesystem
 313 * has done its completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
 642 * Btrfs cannot handle having logically non-contiguous requests
 643 * submitted. For example if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
 646 * Physical: [0-4095] [4096-8191]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1110,7 +1157,7 @@ ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1157__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1158 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1159 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1160 dio_submit_t submit_io, int flags)
1114{ 1161{
1115 int seg; 1162 int seg;
1116 size_t size; 1163 size_t size;
@@ -1197,7 +1244,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1244 (end > i_size_read(inode)));
1198 1245
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1246 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1247 nr_segs, blkbits, get_block, end_io,
1248 submit_io, dio);
1201 1249
1202 /* 1250 /*
1203 * In case of error extending write may have instantiated a few 1251 * In case of error extending write may have instantiated a few
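
The direct-io changes above export dio_end_io() and let a filesystem pass its own dio_submit_t to __blockdev_direct_IO(). A sketch of how a filesystem might wire that up; the myfs_* names and myfs_get_block are hypothetical (btrfs is the intended user, but these are illustrative only):

	static void myfs_end_dio_bio(struct bio *bio, int error)
	{
		/* filesystem-specific completion work (checksums, ordered extents, ...) */

		/* bi_private still points at the struct dio, so hand it back */
		dio_end_io(bio, error);
	}

	static void myfs_submit_dio_bio(int rw, struct bio *bio,
					struct inode *inode, loff_t file_offset)
	{
		/* take over completion, then submit as usual */
		bio->bi_end_io = myfs_end_dio_bio;
		submit_bio(rw, bio);
	}

	static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
				      const struct iovec *iov, loff_t offset,
				      unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
					    iov, offset, nr_segs,
					    myfs_get_block,	/* hypothetical */
					    NULL, myfs_submit_dio_bio, 0);
	}
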
diff --git a/fs/exec.c b/fs/exec.c
index 9badbc0bfb1d..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
768 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
769 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
770 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
771 int count;
772 771
773 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
774 goto no_thread_group; 773 goto no_thread_group;
@@ -785,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
785 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
786 return -EAGAIN; 785 return -EAGAIN;
787 } 786 }
787
788 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
789 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
790 792
791 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
792 count = thread_group_leader(tsk) ? 1 : 2;
793 sig->notify_count = count;
794 while (atomic_read(&sig->count) > count) {
795 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
796 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
797 schedule(); 796 schedule();
@@ -1662,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1662 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1663 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1664 struct completion *vfork_done; 1663 struct completion *vfork_done;
1665 int core_waiters; 1664 int core_waiters = -EBUSY;
1666 1665
1667 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1668 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1669 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1670 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1671 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1672 1674
1673 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1787,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1787} 1789}
1788 1790
1789 1791
1792/*
1793 * umh_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or
1798 * PTR_ERR on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
1790void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1791{ 1837{
1792 struct core_state core_state; 1838 struct core_state core_state;
1793 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1794 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1795 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1796 struct inode * inode;
1797 const struct cred *old_cred; 1842 const struct cred *old_cred;
1798 struct cred *cred; 1843 struct cred *cred;
1799 int retval = 0; 1844 int retval = 0;
1800 int flag = 0; 1845 int flag = 0;
1801 int ispipe = 0; 1846 int ispipe;
1802 char **helper_argv = NULL;
1803 int helper_argc = 0;
1804 int dump_count = 0;
1805 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1806 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1807 .signr = signr, 1849 .signr = signr,
@@ -1820,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1820 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1821 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1822 goto fail; 1864 goto fail;
1823 1865 if (!__get_dumpable(cprm.mm_flags))
1824 cred = prepare_creds();
1825 if (!cred) {
1826 retval = -ENOMEM;
1827 goto fail; 1866 goto fail;
1828 }
1829 1867
1830 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1831 /* 1869 if (!cred)
1832 * If another thread got here first, or we are not dumpable, bail out.
1833 */
1834 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1835 up_write(&mm->mmap_sem);
1836 put_cred(cred);
1837 goto fail; 1870 goto fail;
1838 }
1839
1840 /* 1871 /*
1841 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1842 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1849,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1849 } 1880 }
1850 1881
1851 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1852 if (retval < 0) { 1883 if (retval < 0)
1853 put_cred(cred); 1884 goto fail_creds;
1854 goto fail;
1855 }
1856 1885
1857 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1858 1887
@@ -1870,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1870 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1871 unlock_kernel(); 1900 unlock_kernel();
1872 1901
1873 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1874 goto fail_unlock;
1875
1876 if (ispipe) { 1902 if (ispipe) {
1877 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1878 /* 1907 /*
1879 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1880 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1881 * cprm.limit of 0 here as a special value. Any 1910 * cprm.limit of 1 here as a special value. Any
1882 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1883 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1884 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1885 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1886 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1887 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1888 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1890,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1890 * core_pattern process dies. 1919 * core_pattern process dies.
1891 */ 1920 */
1892 printk(KERN_WARNING 1921 printk(KERN_WARNING
1893 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1894 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1895 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1896 goto fail_unlock; 1925 goto fail_unlock;
1897 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1898 1928
1899 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1900 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1904,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1904 goto fail_dropcount; 1934 goto fail_dropcount;
1905 } 1935 }
1906 1936
1907 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1908 if (!helper_argv) { 1938 if (!helper_argv) {
1909 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1910 __func__); 1940 __func__);
1911 goto fail_dropcount; 1941 goto fail_dropcount;
1912 } 1942 }
1913 1943
1914 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1915 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1916 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1917 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1918 &cprm.file)) { 1948 if (retval) {
1919 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1920 corename); 1950 corename);
1921 goto fail_dropcount; 1951 goto close_fail;
1922 } 1952 }
1923 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1924 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1925 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1926 0600); 1961 0600);
1927 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1928 goto fail_dropcount; 1963 goto fail_unlock;
1929 inode = cprm.file->f_path.dentry->d_inode;
1930 if (inode->i_nlink > 1)
1931 goto close_fail; /* multiple links - don't dump */
1932 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1933 goto close_fail;
1934
1935 /* AK: actually i see no reason to not allow this for named pipes etc.,
1936 but keep the previous behaviour for now. */
1937 if (!ispipe && !S_ISREG(inode->i_mode))
1938 goto close_fail;
1939 /*
1940 * Dont allow local users get cute and trick others to coredump
1941 * into their pre-created files:
1942 * Note, this is not relevant for pipes
1943 */
1944 if (!ispipe && (inode->i_uid != current_fsuid()))
1945 goto close_fail;
1946 if (!cprm.file->f_op)
1947 goto close_fail;
1948 if (!cprm.file->f_op->write)
1949 goto close_fail;
1950 if (!ispipe &&
1951 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1952 goto close_fail;
1953 1964
1954 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually i see no reason to not allow this for named
1972 * pipes etc, but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Don't allow local users to get cute and trick others to coredump
1978 * into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1955 1987
1988 retval = binfmt->core_dump(&cprm);
1956 if (retval) 1989 if (retval)
1957 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1958close_fail: 1991
1959 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1960 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1961 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1962fail_dropcount: 1997fail_dropcount:
1963 if (dump_count) 1998 if (ispipe)
1964 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1965fail_unlock: 2000fail_unlock:
1966 if (helper_argv) 2001 coredump_finish(mm);
1967 argv_free(helper_argv);
1968
1969 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1970 put_cred(cred); 2004 put_cred(cred);
1971 coredump_finish(mm);
1972fail: 2005fail:
1973 return; 2006 return;
1974} 2007}
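With umh_pipe_setup() wired into do_coredump() above, a core_pattern that begins with '|' spawns a usermode helper whose stdin (fd 0) is the read end of the pipe the kernel writes the core image to. A rough userspace counterpart, assuming the helper was registered with something like echo "|/usr/local/bin/core-catcher %p" > /proc/sys/kernel/core_pattern (the path and arguments are illustrative, not part of this patch):

        /* core-catcher.c: drain the core image from stdin into a file. */
        #include <stdio.h>
        #include <unistd.h>
        #include <fcntl.h>

        int main(int argc, char **argv)
        {
                char path[256], buf[65536];
                ssize_t n;
                int fd;

                /* %p in core_pattern arrives here as argv[1]: the crashing pid */
                snprintf(path, sizeof(path), "/var/crash/core.%s",
                         argc > 1 ? argv[1] : "unknown");
                fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
                if (fd < 0)
                        return 1;

                while ((n = read(0, buf, sizeof(buf))) > 0)
                        if (write(fd, buf, n) != n)
                                return 1;

                close(fd);
                return 0;
        }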
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
596 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..60bd31026e7c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 81 * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
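The struct defined just above replaces the dummy buffer_head that callers used to hand to ext4_get_blocks(); the ext4_readdir() hunk earlier in this diff shows the new calling convention. A minimal lookup with it might look like the following sketch (myfs_lookup_block() and its caller are illustrative; everything else comes from the declarations above):

        /* Look up one logical block of @inode without allocating anything. */
        static ext4_fsblk_t myfs_lookup_block(struct inode *inode, ext4_lblk_t lblk)
        {
                struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
                int ret;

                ret = ext4_map_blocks(NULL, inode, &map, 0);    /* no CREATE flags */
                if (ret <= 0 || !(map.m_flags & EXT4_MAP_MAPPED))
                        return 0;                               /* hole or error */
                return map.m_pblk;                              /* physical block number */
        }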
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
398 * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
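Since TEST_FLAG_VALUE() and CHECK_FLAG_VALUE() compare compile-time constants, the whole body of ext4_check_flag_values() folds away once every bit number matches its EXT4_*_FL counterpart. For a single flag the expansion is roughly the following (assuming the usual on-disk value EXT4_EXTENTS_FL == 0x80000, i.e. bit 19):

        /* CHECK_FLAG_VALUE(EXTENTS) expands to approximately: */
        if (!(EXT4_EXTENTS_FL == (1 << EXT4_INODE_EXTENTS))) {  /* 0x80000 == (1 << 19) */
                printk(KERN_EMERG "EXT4 flag fail: EXTENTS: %d %d\n",
                       EXT4_EXTENTS_FL, EXT4_INODE_EXTENTS);
                BUG_ON(1);
        }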
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
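compat_ext4_new_group_input exists because the native struct carries 64-bit block numbers, which 32-bit user space lays out with different alignment and padding; a compat ioctl handler has to read the 32-bit layout and widen it before reusing the normal group-add path. A hypothetical shape of that conversion (the actual handler lives in fs/ext4/ioctl.c, which is not quoted here, so treat this purely as a sketch; arg is the unsigned long ioctl argument):

        struct compat_ext4_new_group_input cinput;
        struct ext4_new_group_input input;

        if (copy_from_user(&cinput, (void __user *)arg, sizeof(cinput)))
                return -EFAULT;
        input.group           = cinput.group;
        input.block_bitmap    = cinput.block_bitmap;
        input.inode_bitmap    = cinput.inode_bitmap;
        input.inode_table     = cinput.inode_table;
        input.blocks_count    = cinput.blocks_count;
        input.reserved_blocks = cinput.reserved_blocks;
        input.unused          = cinput.unused;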
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
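For reference, the EXT4_INODE_BIT_FNS(flag, flags) instantiation in the hunk above generates three one-line helpers; written out by hand they look like this (the state/state_flags instantiation is identical apart from the field name):

        static inline int ext4_test_inode_flag(struct inode *inode, int bit)
        {
                return test_bit(bit, &EXT4_I(inode)->i_flags);
        }

        static inline void ext4_set_inode_flag(struct inode *inode, int bit)
        {
                set_bit(bit, &EXT4_I(inode)->i_flags);
        }

        static inline void ext4_clear_inode_flag(struct inode *inode, int bit)
        {
                clear_bit(bit, &EXT4_I(inode)->i_flags);
        }

This is also why i_flags in struct ext4_inode_info changes from __u32 to unsigned long above: test_bit(), set_bit() and clear_bit() operate on unsigned long words.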
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (upto three - one initialized and two 2616 * extent into multiple extents (upto three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2621 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
2663 * It is safe to convert extent to initialized via explicit
2664 * zeroout only if extent is fully inside i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2671 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
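The eof_block/may_zeroout logic added above only allows the zeroout shortcut when the uninitialized extent lies entirely inside i_size (rounded up to a block boundary) or inside the size the current write will produce, as the new comment states. A small worked example with made-up numbers (4 KB blocks, so s_blocksize_bits = 12):

        /* i_size = 10000 bytes, write of m_len = 4 blocks starting at m_lblk = 5 */
        unsigned int eof_block, may_zeroout;

        eof_block = (10000 + 4096 - 1) >> 12;   /* = 3: logical blocks 0..2 hold data */
        if (eof_block < 5 + 4)
                eof_block = 5 + 4;              /* the write extends EOF, so eof_block = 9 */

        /* uninitialized extent covering blocks 4..11: ee_block = 4, ee_len = 8 */
        may_zeroout = (4 + 8 <= 9);             /* 12 > 9, so may_zeroout = 0: no zeroout */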
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialised */ 2699 /* ex3: to ee_block + ee_len : uninitialised */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ 2702 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
2688 * iblock == ee_block is handled by the zeroout 2705 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
2807 * to insert a extent in the middle zerout directly 2826 * to insert a extent in the middle zerout directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when DIO to write 2927 * ext4_get_blocks_dio_write() when DIO to write
2909 * to an uninitialized extent. 2928 * to an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert extent to initialized via explicit
2988 * zeroout only if the extent is fully inside i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submitting the IO, split the extent */ 3203 /* get_block() before submitting the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode (non-aio case) or end_io struct (aio case) 3208 * Flag the inode (non-aio case) or end_io struct (aio case)
3177 * that this IO needs to be converted to written when IO is 3209 * that this IO needs to be converted to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert */ 3249 /* buffered write, writepage time, convert */
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because 3704 * We can't just convert len to max_blocks because
3666 * if blocksize = 4096, offset = 3072 and len = 2048, two blocks are needed 3705 * if blocksize = 4096, offset = 3072 and len = 2048, two blocks are needed
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, map.m_lblk, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because 3786 * We can't just convert len to max_blocks because
3746 * if blocksize = 4096, offset = 3072 and len = 2048, two blocks are needed 3787 * if blocksize = 4096, offset = 3072 and len = 2048, two blocks are needed
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3937 int error = 0;
3899 3938
3900 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3942 ext4_get_block);
3904 3943
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..b6a74f991bf4 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply to
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = simple_fsync(file, dentry, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT); 131 NULL, BLKDEV_IFL_WAIT);
105 jbd2_log_wait_commit(journal, commit_tid); 132 ret = jbd2_log_wait_commit(journal, commit_tid);
106 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT); 135 BLKDEV_IFL_WAIT);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -1041,7 +1034,7 @@ got:
1041 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1042 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1043 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1044 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1045 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1046 } 1039 }
1047 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..19df61c321fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 348 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 350 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
352 "invalid block reference %u " 352 "invalid block reference %u", blk);
353 "in inode #%lu", blk, inode->i_ino);
354 return -EIO; 353 return -EIO;
355 } 354 }
356 } 355 }
@@ -785,7 +784,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
788 /* 787 /*
789 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 874
876err_out: 875err_out:
877 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
878 /* 877 /*
879 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
890} 889}
891 890
892/* 891/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
896 * 895 *
897 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 917 * blocks.
919 */ 918 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 921 int flags)
924{ 922{
925 int err = -EIO; 923 int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 931 int count = 0;
934 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
935 933
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 937 &blocks_to_boundary);
940 938
941 if (depth == 0) 939 if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 945 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 947 count++;
951 /*map more blocks*/ 948 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
954 951
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 966 /*
970 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
971 */ 968 */
972 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 970
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
980 */ 977 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
983 /* 980 /*
984 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
985 */ 982 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 984 &count, goal,
988 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
989 986
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 993 */
997 if (!err) 994 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 996 partial, indirect_blks, count);
1000 if (err) 997 if (err)
1001 goto cleanup; 998 goto cleanup;
1002 999
1003 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1004 1001
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1003got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1008 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1009 err = count;
1011 /* Clean up and exit */ 1010 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
1016 brelse(partial->bh); 1015 brelse(partial->bh);
1017 partial--; 1016 partial--;
1018 } 1017 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1018out:
1021 return err; 1019 return err;
1022} 1020}
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1059 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1061{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1064
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1074{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1077
1081 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1088
1092 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1095
1100 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1097 /*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1104 * allocation blocks. 1100 * allocation blocks.
1105 */ 1101 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1106 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1108
1113 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1110 if (quota_claim)
1115 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1116 if (mdb_free) 1112 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1113 /*
1120 * We did fallocate with an offset that is already delayed 1114 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should 1115 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1117 */
1127 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1119 }
1131 1120
1132 /* 1121 /*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1140} 1129}
1141 1130
1142static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1144{ 1133{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1148 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1149 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1151 return -EIO; 1140 return -EIO;
1152 } 1141 }
1153 return 0; 1142 return 0;
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1201}
1213 1202
1214/* 1203/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1205 * and returns if the blocks are already mapped.
1217 * 1206 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocates blocks 1207 * Otherwise it takes the write lock of the i_data_sem and allocates blocks
1219 * and stores the allocated blocks in the result buffer head and marks it 1208 * and stores the allocated blocks in the result buffer head and marks it
1220 * mapped. 1209 * mapped.
1221 * 1210 *
1222 * If file type is extents based, it will call ext4_ext_get_blocks(), 1211 * If file type is extents based, it will call ext4_ext_map_blocks(),
1223 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1212 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1213 * based files
1225 * 1214 *
1226 * On success, it returns the number of blocks being mapped or allocated. 1215 * On success, it returns the number of blocks being mapped or allocated.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1222 *
1234 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1235 */ 1224 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1227{
1240 int retval; 1228 int retval;
1241 1229
1242 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1234 /*
1249 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1250 * file system block. 1236 * file system block.
1251 */ 1237 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1241 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1243 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1261 1245
1262 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1248 if (ret != 0)
1266 return ret; 1249 return ret;
1267 } 1250 }
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * ext4_ext_get_block() returns with create = 0 1260 * ext4_ext_get_block() returns with create = 0
1278 * and the buffer head unmapped. 1261 * and the buffer head unmapped.
1279 */ 1262 */
1280 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1264 return retval;
1282 1265
1283 /* 1266 /*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1274 * set on the buffer_head.
1292 */ 1275 */
1293 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1277
1295 /* 1278 /*
1296 * New blocks allocate and/or writing to uninitialized extent 1279 * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1296 * could have changed the inode type in between
1314 */ 1297 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1300 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1302
1322 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1304 /*
1324 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1324
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1347 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1348 block, bh->b_blocknr, retval); 1329 map);
1349 if (ret != 0) 1330 if (ret != 0)
1350 return ret; 1331 return ret;
1351 } 1332 }
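
With ext4_get_blocks() folded into ext4_map_blocks(), callers no longer pass a throwaway buffer_head; they describe the request in a struct ext4_map_blocks and read the answer back from it. A minimal lookup-only caller, as a hypothetical helper that mirrors the pattern ext4_getblk() adopts later in this patch:

    /* Look up (without allocating) the physical block behind logical block
     * lblk of an inode.  Returns 1 and fills *pblk on success, 0 for a hole,
     * or a negative errno. */
    static int ext4_lookup_one_block(struct inode *inode, ext4_lblk_t lblk,
                                     ext4_fsblk_t *pblk)
    {
            struct ext4_map_blocks map;
            int ret;

            map.m_lblk = lblk;
            map.m_len = 1;
            ret = ext4_map_blocks(NULL, inode, &map, 0);    /* flags == 0: lookup only */
            if (ret <= 0)
                    return ret;             /* 0 = hole, < 0 = error */
            *pblk = map.m_pblk;             /* m_flags carries EXT4_MAP_MAPPED/NEW/UNWRITTEN */
            return ret;
    }
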
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1357 1338
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1360{ 1341{
1361 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1345 int dio_credits;
1365 1346
1366 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1367 /* Direct IO write... */ 1351 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1374 goto out; 1358 return ret;
1375 } 1359 }
1376 started = 1; 1360 started = 1;
1377 } 1361 }
1378 1362
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1364 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1368 ret = 0;
1384 } 1369 }
1385 if (started) 1370 if (started)
1386 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1387out:
1388 return ret; 1372 return ret;
1389} 1373}
1390 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
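
ext4_get_block() keeps the generic get_block_t signature and is now only a thin shim over _ext4_get_block(); the bridging from struct ext4_map_blocks back to buffer_head state (map_bh() plus the EXT4_MAP_FLAGS merge above) is what lets the stock block/mpage helpers stay unchanged. A sketch of the usual consumer, along the lines of ext4's own readpage path (not part of this hunk):

    static int ext4_readpage_sketch(struct file *file, struct page *page)
    {
            /* the generic mpage code only needs a get_block_t callback;
             * ext4_get_block() turns each call into an ext4_map_blocks() */
            return mpage_readpage(page, ext4_get_block);
    }
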
1381
1391/* 1382/*
1392 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1393 */ 1384 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1396{ 1387{
1397 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1398 int fatal = 0, err; 1390 int fatal = 0, err;
1399 int flags = 0;
1400 1391
1401 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1402 1393
1403 dummy.b_state = 0; 1394 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1409 /* 1400 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1411 * case of a HOLE. 1402 return NULL;
1412 */ 1403 *errp = 0;
1413 if (err > 0) { 1404
1414 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1406 if (!bh) {
1416 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1417 } 1409 }
1418 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1413
1430 /* 1414 /*
1431 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1419 * problem.
1436 */ 1420 */
1437 lock_buffer(bh); 1421 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1427 }
1457 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1435 }
1459err: 1436 if (fatal) {
1460 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1461} 1442}
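
The rewritten ext4_getblk() folds the old dummy-buffer_head dance into one ext4_map_blocks() call, and its return convention is worth spelling out: provided the caller starts with the error variable at zero, a NULL return with the error still zero means a hole, while NULL with a negative error is a real failure. A hedged caller sketch (hypothetical, error handling abbreviated):

    int err = 0;
    struct buffer_head *bh;

    bh = ext4_getblk(NULL, inode, blk, 0, &err);    /* create == 0, so no handle needed */
    if (!bh && err < 0)
            return err;     /* I/O or mapping failure */
    if (!bh)
            return 0;       /* hole: nothing allocated, not an error */
    /* ... use bh->b_data ... */
    brelse(bh);
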
1462 1443
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1841 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1864 int ret; 1845 int ret;
1865 1846
1866 /* 1847 /*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1851 */
1871repeat: 1852repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1877 1857
1878 /* 1858 /*
1879 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1881 * time. 1861 * a small amount in the end. Here we just reserve for data.
1882 */ 1862 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1864 if (ret)
1885 return ret; 1865 return ret;
1886 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1873 yield();
1891 goto repeat; 1874 goto repeat;
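
The effect of the two hunks above: the quota reservation now covers only the data block, while the (possibly over-estimated) metadata is still claimed against the filesystem's dirty-block pool so writeout cannot run out of space. A rough worked example, assuming ext4_calc_metadata_amount() estimates 3 metadata blocks for this logical block:

    md_needed = 3;                          /* estimate; may shrink at writeout      */
    dquot_reserve_block(inode, 1);          /* quota: charge the 1 data block only   */
    ext4_claim_free_blocks(sbi, 3 + 1);     /* sb accounting: metadata + data        */
    /* and on failure only the single data block is released from the quota:
     * dquot_release_reservation_block(inode, 1) */
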
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1893
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1895
1896 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1898 /*
1915 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1933 * allocation blocks. 1917 * allocation blocks.
1934 */ 1918 */
1935 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1938 } 1923 }
1939 1924
1940 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1927
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2027/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2029 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 * the function goes through all passed space and put actual disk 2030 * the function goes through all passed space and put actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2032 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2055{ 2035{
2056 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2041 pgoff_t index, end;
2062 struct pagevec pvec; 2042 struct pagevec pvec;
2063 int nr_pages, i; 2043 int nr_pages, i;
2064 2044
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2048
2069 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
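
The index/end computation above converts the block range carried in the map into page-cache indexes. A quick worked example, with 4KB pages (PAGE_CACHE_SHIFT = 12) and a 1KB block size (i_blkbits = 10), so the shift is 12 - 10 = 2:

    /* map->m_lblk = 21, map->m_len = 8 */
    index       = 21 >> 2;              /* = 5: first page touched                */
    end         = (21 + 8 - 1) >> 2;    /* = 7: last page touched                 */
    cur_logical = 5 << 2;               /* = 20: first block of the first page;   */
                                        /* the skip loop below advances it to 21  */
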
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2070
2091 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2092 do { 2072 do {
2093 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2094 break; 2074 break;
2095 cur_logical++; 2075 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2097 2077
2098 do { 2078 do {
2099 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2080 break;
2101 2081
2102 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2083
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2085
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2121 2100
2122 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2124 cur_logical++; 2103 cur_logical++;
2125 pblock++; 2104 pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2109}
2131 2110
2132 2111
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2150{ 2114{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2171{
2208 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2173 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2249 */ 2213 */
2250 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2221
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2223 if (blks < 0) {
2260 err = blks; 2224 err = blks;
2261 /* 2225 /*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2283 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2284 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2285 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2286 (unsigned long long) next, 2250 (unsigned long long) next,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2288 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2261 }
2298 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2299 2263
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2301 2267
2302 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2304 2271
2305 /* 2272 /*
2306 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2275 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2312 2279
2313 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2316 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
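
The new cut-off keeps one extent of dirty pages within what mballoc will allocate in a single call, as the comment says. Worked out for common block sizes:

    8*1024*1024 / 4096 = 2048 blocks   (8MB extent with 4KB blocks)
    8*1024*1024 / 1024 = 8192 blocks   (8MB extent with 1KB blocks)

Anything at or above that limit is flushed immediately via flush_it instead of being grown further.
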
2352 /* check if thereserved journal credits might overflow */ 2328 /* check if thereserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2331 /*
2356 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2424 sector_t logical; 2400 sector_t logical;
2425 2401
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the page in the page_vec
2429 * redirty then and skip then. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2402 /*
2438 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2439 */ 2404 */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2493 * initialized properly.
2529 */ 2494 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2532{ 2497{
2498 struct ext4_map_blocks map;
2533 int ret = 0; 2499 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2501
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2503 invalid_block = ~0;
2538 2504
2539 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2541 2510
2542 /* 2511 /*
2543 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should treated 2513 * preallocated blocks are unmapped but should treated
2545 * the same as allocated blocks. 2514 * the same as allocated blocks.
2546 */ 2515 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2550 /* 2522 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2524 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2557 return ret; 2529 return ret;
2558 2530
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2534 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2535 }
2577 2536
2578 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2579} 2551}
2580 2552
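
After this rewrite ext4_da_get_block_prep() leaves the buffer in one of three states that the write_begin path can tell apart: an error return, a delayed buffer parked on the invalid_block sentinel with space reserved, or a real mapping copied out of the ext4_map_blocks result (with unwritten buffers additionally marked new). A rough caller sketch, purely illustrative:

    struct buffer_head bh = { .b_size = inode->i_sb->s_blocksize };
    int ret = ext4_da_get_block_prep(inode, lblk, &bh, 1);

    if (ret)
            return ret;     /* lookup or reservation failed                        */
    if (buffer_delay(&bh))
            ;   /* hole: space reserved, real block assigned at writeout time      */
    else if (buffer_mapped(&bh))
            ;   /* already allocated (or preallocated/unwritten) at bh.b_blocknr   */
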
2581/* 2553/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2599{ 2571{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2574}
2616 2575
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2823 */ 2782 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2786
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2788}
2830 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
 2845 * real expectation of this data integrity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
2907
 2907 2831static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
 2832 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
 2833{ 2910{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2916 int pages_written = 0;
2841 long pages_skipped; 2917 long pages_skipped;
2842 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2918 2994
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2926 2996
2927retry: 2997retry:
@@ -2941,7 +3011,7 @@ retry:
2941 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3016 goto out_writepages;
2947 } 3017 }
@@ -2963,8 +3033,7 @@ retry:
2963 mpd.io_done = 0; 3033 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2965 mpd.retval = 0; 3035 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3037 /*
2969 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3021 3090
3022 /* Update index */ 3091 /* Update index */
@@ -3030,8 +3099,6 @@ retry:
3030 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3031 3100
3032out_writepages: 3101out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3078{ 3145{
3079 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3080 struct page *page; 3147 struct page *page;
3081 pgoff_t index; 3148 pgoff_t index;
3082 unsigned from, to; 3149 unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
3135 3202
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3204 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3205out:
3155 return ret; 3206 return ret;
3156} 3207}
@@ -3546,46 +3597,18 @@ out:
3546 return ret; 3597 return ret;
3547} 3598}
3548 3599
3600/*
3601 * ext4_get_block used when preparing for a DIO write or buffer write.
 3602 * We allocate an uninitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3551{ 3607{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3609 inode->i_ino, create);
3560 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3612}
3590 3613
3591static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3975 3998
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4001
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4325
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4327 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4331 return 1;
4310 } 4332 }
4311 4333
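
From here on, ext4_error() calls that printed inode->i_ino by hand are switched to EXT4_ERROR_INODE(), which takes the inode itself, so the explicit inode number drops out of every format string. The macro is defined in ext4.h, not in this diff; behaviourally it amounts to something like this sketch:

    /* sketch only -- the real definition lives in ext4.h */
    #define EXT4_ERROR_INODE(inode, fmt, ...) \
            ext4_error((inode)->i_sb, "inode #%lu: " fmt, \
                       (inode)->i_ino, ##__VA_ARGS__)
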
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4434 else
4413 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4436 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4437 "block %llu",
4416 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4439 }
4419} 4440}
4420 4441
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4473
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4475 nr, 1)) {
4455 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4458 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4480 break;
4461 } 4481 }
4462 4482
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4488 * (should be rare).
4469 */ 4489 */
4470 if (!bh) { 4490 if (!bh) {
4471 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4472 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4473 inode->i_ino, nr); 4493 (unsigned long long) nr);
4474 continue; 4494 continue;
4475 } 4495 }
4476 4496
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4613 return; 4633 return;
4614 4634
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4636
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4639
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4622 return; 4642 return;
4623 } 4643 }
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4805
4786 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4787 if (!bh) { 4807 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4790 return -EIO; 4810 return -EIO;
4791 } 4811 }
4792 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4888 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4889 brelse(bh); 4909 brelse(bh);
4890 return -EIO; 4910 return -EIO;
4891 } 4911 }
@@ -5096,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5116 ret = 0;
5097 if (ei->i_file_acl && 5117 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5118 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5119 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5120 ei->i_file_acl);
5101 ret = -EIO; 5121 ret = -EIO;
5102 goto bad_inode; 5122 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5123 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5162 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5163 } else {
5144 ret = -EIO; 5164 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5165 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5166 goto bad_inode;
5148 } 5167 }
5149 brelse(iloc.bh); 5168 brelse(iloc.bh);
@@ -5381,9 +5400,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5400 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5401 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5402 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5403 EXT4_ERROR_INODE(inode,
5385 "inode=%lu, block=%llu", inode->i_ino, 5404 "IO error syncing inode (block=%llu)",
5386 (unsigned long long)iloc.bh->b_blocknr); 5405 (unsigned long long) iloc.bh->b_blocknr);
5387 err = -EIO; 5406 err = -EIO;
5388 } 5407 }
5389 brelse(iloc.bh); 5408 brelse(iloc.bh);
@@ -5455,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5474 }
5456 5475
5457 if (attr->ia_valid & ATTR_SIZE) { 5476 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5477 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5478 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5479
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5480 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5468 if (S_ISREG(inode->i_mode) && 5487 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5488 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5489 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5490 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5491 handle_t *handle;
5473 5492
5474 handle = ext4_journal_start(inode, 3); 5493 handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5519 }
5501 } 5520 }
5502 /* ext4_truncate will clear the flag */ 5521 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5522 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5523 ext4_truncate(inode);
5505 } 5524 }
5506 5525
@@ -5576,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5595
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5596static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5597{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5598 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5599 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5600 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5601}
@@ -5911,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5930 */
5912 5931
5913 if (val) 5932 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5933 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5934 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5935 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5936 ext4_set_aops(inode);
5918 5937
5919 jbd2_journal_unlock_updates(journal); 5938 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
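
EXT4_IOC32_GROUP_ADD is now translated field by field because the 64-bit bitmap/table block numbers in ext4_new_group_input are packed differently in a 32-bit task, so the structure cannot simply be cast; the set_fs(KERNEL_DS) pair then lets the native ext4_ioctl() handler copy the translated structure from kernel memory. The compat layout itself (struct compat_ext4_new_group_input) is declared elsewhere; as an assumption-laden sketch, it mirrors the native fields with compat types, roughly:

    struct compat_ext4_new_group_input {
            u32 group;                   /* block group to add                   */
            compat_u64 block_bitmap;     /* 32-bit-aligned 64-bit block numbers  */
            compat_u64 inode_bitmap;
            compat_u64 inode_table;
            u32 blocks_count;
            u16 reserved_blocks;
            u16 unused;
    };
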
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 658 }
659} 659}
660 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
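
bb_largest_free_order caches the order of the biggest free power-of-two extent in the group, so that ext4_mb_good_group() (changed further down) can reject a group for an order-N request without ever loading its buddy bitmap. A worked example with a 4KB block size (s_blocksize_bits = 12, so the scan starts at order 13):

    /* bb_counters[]: free extents per order, e.g. 5 at order 0, 2 at order 3,
     * nothing above order 3 */
    grp->bb_largest_free_order = 3;     /* highest non-empty slot found by the loop */
    /* a cr == 0 request with ac_2order = 4 (an aligned 16-block chunk) can now
     * skip this group outright: 3 < 4, and no buddy page was read for it */
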
661static noinline_for_stack 682static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
700 */ 721 */
701 grp->bb_free = free; 722 grp->bb_free = free;
702 } 723 }
724 mb_set_largest_free_order(sb, grp);
703 725
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 727
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
725 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 747 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
726 * So it can have information regarding groups_per_page which 748 * So it can have information regarding groups_per_page which
727 * is blocks_per_page/2 749 * is blocks_per_page/2
750 *
751 * Locking note: This routine takes the block group lock of all groups
752 * for this page; do not hold this lock when calling this routine!
728 */ 753 */
729 754
730static int ext4_mb_init_cache(struct page *page, char *incore) 755static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
865 BUG_ON(incore == NULL); 890 BUG_ON(incore == NULL);
866 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 891 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
867 group, page->index, i * blocksize); 892 group, page->index, i * blocksize);
893 trace_ext4_mb_buddy_bitmap_load(sb, group);
868 grinfo = ext4_get_group_info(sb, group); 894 grinfo = ext4_get_group_info(sb, group);
869 grinfo->bb_fragments = 0; 895 grinfo->bb_fragments = 0;
870 memset(grinfo->bb_counters, 0, 896 memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
882 BUG_ON(incore != NULL); 908 BUG_ON(incore != NULL);
883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 909 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
884 group, page->index, i * blocksize); 910 group, page->index, i * blocksize);
911 trace_ext4_mb_bitmap_load(sb, group);
885 912
886 /* see comments in ext4_mb_put_pa() */ 913 /* see comments in ext4_mb_put_pa() */
887 ext4_lock_group(sb, group); 914 ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
910 return err; 937 return err;
911} 938}
912 939
940/*
941 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
942 * block group lock of all groups for this page; do not hold the BG lock when
943 * calling this routine!
944 */
913static noinline_for_stack 945static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 946int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{ 947{
@@ -1004,6 +1036,11 @@ err:
1004 return ret; 1036 return ret;
1005} 1037}
1006 1038
1039/*
1040 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1041 * block group lock of all groups for this page; do not hold the BG lock when
1042 * calling this routine!
1043 */
1007static noinline_for_stack int 1044static noinline_for_stack int
1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1045ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1009 struct ext4_buddy *e4b) 1046 struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
1150 return ret; 1187 return ret;
1151} 1188}
1152 1189
1153static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1190static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1154{ 1191{
1155 if (e4b->bd_bitmap_page) 1192 if (e4b->bd_bitmap_page)
1156 page_cache_release(e4b->bd_bitmap_page); 1193 page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1299 buddy = buddy2; 1336 buddy = buddy2;
1300 } while (1); 1337 } while (1);
1301 } 1338 }
1339 mb_set_largest_free_order(sb, e4b->bd_info);
1302 mb_check_buddy(e4b); 1340 mb_check_buddy(e4b);
1303} 1341}
1304 1342
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1427 e4b->bd_info->bb_counters[ord]++; 1465 e4b->bd_info->bb_counters[ord]++;
1428 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1429 } 1467 }
1468 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1430 1469
1431 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1470 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1432 mb_check_buddy(e4b); 1471 mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1617 } 1656 }
1618 1657
1619 ext4_unlock_group(ac->ac_sb, group); 1658 ext4_unlock_group(ac->ac_sb, group);
1620 ext4_mb_release_desc(e4b); 1659 ext4_mb_unload_buddy(e4b);
1621 1660
1622 return 0; 1661 return 0;
1623} 1662}
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1672 ext4_mb_use_best_found(ac, e4b); 1711 ext4_mb_use_best_found(ac, e4b);
1673 } 1712 }
1674 ext4_unlock_group(ac->ac_sb, group); 1713 ext4_unlock_group(ac->ac_sb, group);
1675 ext4_mb_release_desc(e4b); 1714 ext4_mb_unload_buddy(e4b);
1676 1715
1677 return 0; 1716 return 0;
1678} 1717}
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1821 } 1860 }
1822} 1861}
1823 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1824static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1825 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1826{ 1866{
1827 unsigned free, fragments; 1867 unsigned free, fragments;
1828 unsigned i, bits;
1829 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1830 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1831 1870
1832 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1833 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1834 1879
1835 free = grp->bb_free; 1880 free = grp->bb_free;
1836 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1843 case 0: 1888 case 0:
1844 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1845 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1846 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1847 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1848 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1849 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1850 return 0; 1898 return 0;
1851 1899
1852 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1853 for (i = ac->ac_2order; i <= bits; i++)
1854 if (grp->bb_counters[i] > 0)
1855 return 1;
1856 break;
1857 case 1: 1901 case 1:
1858 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1859 return 1; 1903 return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1964 sbi = EXT4_SB(sb); 2008 sbi = EXT4_SB(sb);
1965 ngroups = ext4_get_groups_count(sb); 2009 ngroups = ext4_get_groups_count(sb);
1966 /* non-extent files are limited to low blocks/groups */ 2010 /* non-extent files are limited to low blocks/groups */
1967 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2011 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1968 ngroups = sbi->s_blockfile_groups; 2012 ngroups = sbi->s_blockfile_groups;
1969 2013
1970 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2014 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
2024 group = ac->ac_g_ex.fe_group; 2068 group = ac->ac_g_ex.fe_group;
2025 2069
2026 for (i = 0; i < ngroups; group++, i++) { 2070 for (i = 0; i < ngroups; group++, i++) {
2027 struct ext4_group_info *grp;
2028 struct ext4_group_desc *desc;
2029
2030 if (group == ngroups) 2071 if (group == ngroups)
2031 group = 0; 2072 group = 0;
2032 2073
2033 /* quick check to skip empty groups */ 2074 /* This now checks without needing the buddy page */
2034 grp = ext4_get_group_info(sb, group); 2075 if (!ext4_mb_good_group(ac, group, cr))
2035 if (grp->bb_free == 0)
2036 continue; 2076 continue;
2037 2077
2038 err = ext4_mb_load_buddy(sb, group, &e4b); 2078 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
2040 goto out; 2080 goto out;
2041 2081
2042 ext4_lock_group(sb, group); 2082 ext4_lock_group(sb, group);
2083
2084 /*
2085 * We need to check again after locking the
2086 * block group
2087 */
2043 if (!ext4_mb_good_group(ac, group, cr)) { 2088 if (!ext4_mb_good_group(ac, group, cr)) {
2044 /* someone did allocation from this group */
2045 ext4_unlock_group(sb, group); 2089 ext4_unlock_group(sb, group);
2046 ext4_mb_release_desc(&e4b); 2090 ext4_mb_unload_buddy(&e4b);
2047 continue; 2091 continue;
2048 } 2092 }
2049 2093
2050 ac->ac_groups_scanned++; 2094 ac->ac_groups_scanned++;
2051 desc = ext4_get_group_desc(sb, group, NULL);
2052 if (cr == 0) 2095 if (cr == 0)
2053 ext4_mb_simple_scan_group(ac, &e4b); 2096 ext4_mb_simple_scan_group(ac, &e4b);
2054 else if (cr == 1 && 2097 else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
2058 ext4_mb_complex_scan_group(ac, &e4b); 2101 ext4_mb_complex_scan_group(ac, &e4b);
2059 2102
2060 ext4_unlock_group(sb, group); 2103 ext4_unlock_group(sb, group);
2061 ext4_mb_release_desc(&e4b); 2104 ext4_mb_unload_buddy(&e4b);
2062 2105
2063 if (ac->ac_status != AC_STATUS_CONTINUE) 2106 if (ac->ac_status != AC_STATUS_CONTINUE)
2064 break; 2107 break;
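
The two hunks above turn the allocator loop into a two-stage test: ext4_mb_good_group() is first called without the group lock (and before the buddy is loaded) as a cheap filter, then repeated under ext4_lock_group(), because another CPU may have allocated from the group in between. A generic userspace sketch of that optimistic-check / recheck-under-lock pattern, with a single shared counter standing in for the group's free-block count (illustrative only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int free_blocks = 1;             /* stands in for grp->bb_free */

static int looks_good(void)             /* cheap, lockless pre-check */
{
        return free_blocks > 0;
}

static int try_allocate(void)
{
        if (!looks_good())              /* skip obviously bad groups early */
                return 0;

        pthread_mutex_lock(&lock);
        if (!looks_good()) {            /* someone raced in while we waited */
                pthread_mutex_unlock(&lock);
                return 0;
        }
        free_blocks--;                  /* safe: still good under the lock */
        pthread_mutex_unlock(&lock);
        return 1;
}

int main(void)
{
        printf("first attempt:  %s\n", try_allocate() ? "allocated" : "skipped");
        printf("second attempt: %s\n", try_allocate() ? "allocated" : "skipped");
        return 0;
}
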
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2148 ext4_lock_group(sb, group); 2191 ext4_lock_group(sb, group);
2149 memcpy(&sg, ext4_get_group_info(sb, group), i); 2192 memcpy(&sg, ext4_get_group_info(sb, group), i);
2150 ext4_unlock_group(sb, group); 2193 ext4_unlock_group(sb, group);
2151 ext4_mb_release_desc(&e4b); 2194 ext4_mb_unload_buddy(&e4b);
2152 2195
2153 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2196 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2154 sg.info.bb_fragments, sg.info.bb_first_free); 2197 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2256 init_rwsem(&meta_group_info[i]->alloc_sem); 2299 init_rwsem(&meta_group_info[i]->alloc_sem);
2257 meta_group_info[i]->bb_free_root = RB_ROOT; 2300 meta_group_info[i]->bb_free_root = RB_ROOT;
2301 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2258 2302
2259#ifdef DOUBLE_CHECK 2303#ifdef DOUBLE_CHECK
2260 { 2304 {
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2536 entry->count, entry->group, entry); 2580 entry->count, entry->group, entry);
2537 2581
2538 if (test_opt(sb, DISCARD)) { 2582 if (test_opt(sb, DISCARD)) {
2583 int ret;
2539 ext4_fsblk_t discard_block; 2584 ext4_fsblk_t discard_block;
2540 2585
2541 discard_block = entry->start_blk + 2586 discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2543 trace_ext4_discard_blocks(sb, 2588 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block, 2589 (unsigned long long)discard_block,
2545 entry->count); 2590 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count); 2591 ret = sb_issue_discard(sb, discard_block, entry->count);
2592 if (ret == EOPNOTSUPP) {
2593 ext4_warning(sb,
2594 "discard not supported, disabling");
2595 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2596 }
2547 } 2597 }
2548 2598
2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
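
The hunk above makes release_blocks_on_commit() look at sb_issue_discard()'s return value: if the device reports that discard is unsupported, the warning is printed once and the DISCARD mount option is cleared so no further discards are attempted for this mount. A rough userspace sketch of that try-once-then-disable pattern follows; fake_issue_discard() and OPT_DISCARD are stand-ins for the real sb_issue_discard() and mount-option bit.

#include <errno.h>
#include <stdio.h>

#define OPT_DISCARD 0x1

static unsigned mount_opts = OPT_DISCARD;

static int fake_issue_discard(unsigned long long block, unsigned count)
{
        (void)block; (void)count;
        return EOPNOTSUPP;      /* pretend the device lacks discard support */
}

static void free_blocks(unsigned long long block, unsigned count)
{
        if (mount_opts & OPT_DISCARD) {
                int ret = fake_issue_discard(block, count);
                if (ret == EOPNOTSUPP) {
                        fprintf(stderr, "discard not supported, disabling\n");
                        mount_opts &= ~OPT_DISCARD;     /* never retry */
                }
        }
        /* ... actually return the blocks to the allocator here ... */
}

int main(void)
{
        free_blocks(1000, 8);   /* warns once and clears the option */
        free_blocks(2000, 8);   /* silent: discard already disabled */
        return 0;
}
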
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2568 } 2618 }
2569 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2570 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2571 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2572 } 2622 }
2573 2623
2574 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2641 2691
2642void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2643{ 2693{
2644 /* 2694 /*
2645 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2646 * before destroying the slab cache. 2696 * before destroying the slab cache.
2647 */ 2697 */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2981 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3031 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2982 atomic_inc(&sbi->s_bal_reqs); 3032 atomic_inc(&sbi->s_bal_reqs);
2983 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3033 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2984 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3034 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2985 atomic_inc(&sbi->s_bal_success); 3035 atomic_inc(&sbi->s_bal_success);
2986 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3036 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2987 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3037 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3123 continue; 3173 continue;
3124 3174
3125 /* non-extent files can't have physical blocks past 2^32 */ 3175 /* non-extent files can't have physical blocks past 2^32 */
3126 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3176 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3127 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3177 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3128 continue; 3178 continue;
3129 3179
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3280 spin_unlock(&pa->pa_lock); 3330 spin_unlock(&pa->pa_lock);
3281 3331
3282 grp_blk = pa->pa_pstart; 3332 grp_blk = pa->pa_pstart;
3283 /* 3333 /*
3284 * If doing group-based preallocation, pa_pstart may be in the 3334 * If doing group-based preallocation, pa_pstart may be in the
3285 * next group when pa is used up 3335 * next group when pa is used up
3286 */ 3336 */
@@ -3697,7 +3747,7 @@ out:
3697 ext4_unlock_group(sb, group); 3747 ext4_unlock_group(sb, group);
3698 if (ac) 3748 if (ac)
3699 kmem_cache_free(ext4_ac_cachep, ac); 3749 kmem_cache_free(ext4_ac_cachep, ac);
3700 ext4_mb_release_desc(&e4b); 3750 ext4_mb_unload_buddy(&e4b);
3701 put_bh(bitmap_bh); 3751 put_bh(bitmap_bh);
3702 return free; 3752 return free;
3703} 3753}
@@ -3801,7 +3851,7 @@ repeat:
3801 if (bitmap_bh == NULL) { 3851 if (bitmap_bh == NULL) {
3802 ext4_error(sb, "Error reading block bitmap for %u", 3852 ext4_error(sb, "Error reading block bitmap for %u",
3803 group); 3853 group);
3804 ext4_mb_release_desc(&e4b); 3854 ext4_mb_unload_buddy(&e4b);
3805 continue; 3855 continue;
3806 } 3856 }
3807 3857
@@ -3810,7 +3860,7 @@ repeat:
3810 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3860 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3811 ext4_unlock_group(sb, group); 3861 ext4_unlock_group(sb, group);
3812 3862
3813 ext4_mb_release_desc(&e4b); 3863 ext4_mb_unload_buddy(&e4b);
3814 put_bh(bitmap_bh); 3864 put_bh(bitmap_bh);
3815 3865
3816 list_del(&pa->u.pa_tmp_list); 3866 list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4074 ext4_mb_release_group_pa(&e4b, pa, ac); 4124 ext4_mb_release_group_pa(&e4b, pa, ac);
4075 ext4_unlock_group(sb, group); 4125 ext4_unlock_group(sb, group);
4076 4126
4077 ext4_mb_release_desc(&e4b); 4127 ext4_mb_unload_buddy(&e4b);
4078 list_del(&pa->u.pa_tmp_list); 4128 list_del(&pa->u.pa_tmp_list);
4079 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4129 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4080 } 4130 }
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4484 if (!bh) 4534 if (!bh)
4485 tbh = sb_find_get_block(inode->i_sb, 4535 tbh = sb_find_get_block(inode->i_sb,
4486 block + i); 4536 block + i);
4487 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4537 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4488 inode, tbh, block + i); 4538 inode, tbh, block + i);
4489 } 4539 }
4490 } 4540 }
4491 4541
4492 /* 4542 /*
4493 * We need to make sure we don't reuse the freed block until 4543 * We need to make sure we don't reuse the freed block until
4494 * after the transaction is committed, which we can do by 4544 * after the transaction is committed, which we can do by
4495 * treating the block as metadata, below. We make an 4545 * treating the block as metadata, below. We make an
@@ -4610,7 +4660,7 @@ do_more:
4610 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4660 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4611 } 4661 }
4612 4662
4613 ext4_mb_release_desc(&e4b); 4663 ext4_mb_unload_buddy(&e4b);
4614 4664
4615 freed += count; 4665 freed += count;
4616 4666
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
475 */ 475 */
476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
477 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
478 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
479 return -EINVAL; 479 return -EINVAL;
480 480
481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..3a6c92ac131c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
482 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
483 int ret; 483 int ret;
484 484
485 start_ext.ee_block = end_ext.ee_block = 0;
485 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
486 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
487 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
529 * new_ext |-------| 530 * new_ext |-------|
530 */ 531 */
531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
532 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
533 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
534 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
535 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 while (1) { 693 while (1) {
693 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
694 if (!dext) { 695 if (!dext) {
695 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
696 "The extent for donor must be found"); 697 "The extent for donor must be found");
697 *err = -EIO; 698 *err = -EIO;
698 goto out; 699 goto out;
699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
700 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
701 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
702 "extent(%u) should be equal", 703 "extent(%u) should be equal",
703 donor_off, 704 donor_off,
@@ -976,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
976 } 977 }
977 978
978 /* Ext4 move extent supports only extent based file */ 979 /* Ext4 move extent supports only extent based file */
979 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 980 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
980 ext4_debug("ext4 move extent: orig file is not extents " 981 ext4_debug("ext4 move extent: orig file is not extents "
981 "based file [ino:orig %lu]\n", orig_inode->i_ino); 982 "based file [ino:orig %lu]\n", orig_inode->i_ino);
982 return -EOPNOTSUPP; 983 return -EOPNOTSUPP;
983 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 984 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
984 ext4_debug("ext4 move extent: donor file is not extents " 985 ext4_debug("ext4 move extent: donor file is not extents "
985 "based file [ino:donor %lu]\n", donor_inode->i_ino); 986 "based file [ino:donor %lu]\n", donor_inode->i_ino);
986 return -EOPNOTSUPP; 987 return -EOPNOTSUPP;
@@ -1354,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1354 if (ret1 < 0) 1355 if (ret1 < 0)
1355 break; 1356 break;
1356 if (*moved_len > len) { 1357 if (*moved_len > len) {
1357 ext4_error(orig_inode->i_sb, 1358 EXT4_ERROR_INODE(orig_inode,
1358 "We replaced blocks too much! " 1359 "We replaced blocks too much! "
1359 "sum of replaced: %llu requested: %llu", 1360 "sum of replaced: %llu requested: %llu",
1360 *moved_len, len); 1361 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
187 return blocksize; 187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16); 188 return (len & 65532) | ((len & 3) << 16);
189} 189}
190 190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) 191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{ 192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) 193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
197 if (len == blocksize) { 197 if (len == blocksize) {
198 if (blocksize == 65536) 198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN); 199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else 200 else
201 return cpu_to_le16(0); 201 return cpu_to_le16(0);
202 } 202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); 203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
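
The two hunks above are whitespace-only, but the helpers they touch encode a directory entry's rec_len in a 16-bit on-disk field: entries are 4-byte aligned, so the two low bits are reused to carry bits 16-17 of the length on block sizes above 64 KiB. The following self-contained round-trip sketch covers just that bit-folding; it deliberately omits the kernel's special cases where len == blocksize is stored as 0 (or EXT4_MAX_REC_LEN on a 64 KiB block).

#include <assert.h>
#include <stdio.h>

static unsigned short rec_len_to_disk(unsigned len)
{
        return (unsigned short)((len & 65532) | ((len >> 16) & 3));
}

static unsigned rec_len_from_disk(unsigned short dlen)
{
        unsigned len = dlen;
        return (len & 65532) | ((len & 3) << 16);
}

int main(void)
{
        unsigned samples[] = { 12, 4096, 65532, 65536 + 2048, 262140 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                unsigned len = samples[i];
                unsigned short disk = rec_len_to_disk(len);

                assert(rec_len_from_disk(disk) == len);        /* lossless */
                printf("len %6u -> on-disk 0x%04x\n", len, disk);
        }
        return 0;
}
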
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 349 brelse(bh);
350 } 350 }
351 if (bcount) 351 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 353 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 354 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 355 return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 653 int ret, err;
654 __u32 hashval; 654 __u32 hashval;
655 655
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 657 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 658 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 659 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 661 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 662 hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 801{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 803 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 804 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 805}
806 806
807/* 807/*
@@ -943,8 +943,8 @@ restart:
943 wait_on_buffer(bh); 943 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 946 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 947 (unsigned long) block);
948 brelse(bh); 948 brelse(bh);
949 goto next; 949 goto next;
950 } 950 }
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1067 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1069 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1071 } 1071 }
1072 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1075 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1077 ino); 1077 ino);
1078 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
1079 } else { 1079 } else {
1080 return ERR_CAST(inode); 1080 return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1104 brelse(bh); 1104 brelse(bh);
1105 1105
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1107 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1108 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1110 } 1110 }
1111 1111
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1141 unsigned rec_len = 0;
1142 1142
1143 while (count--) { 1143 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1145 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1146 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1147 memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1407 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1408 brelse(bh);
1411 return -EIO; 1409 return -EIO;
1412 } 1410 }
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1416 brelse(bh);
1419 return retval; 1417 return retval;
1420 } 1418 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1419 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1420 data1 = bh2->b_data;
1423 1421
1424 memcpy (data1, de, len); 1422 memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1489 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1490 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1491 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1492 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1493 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1494 ext4_mark_inode_dirty(handle, dir);
1497 } 1495 }
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1517 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1518 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1519 brelse(bh);
1520 if (retval == 0)
1521 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1522 return retval;
1523} 1523}
1524 1524
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1917 if (err)
1918 ext4_error(inode->i_sb, 1918 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1920 else
1922 ext4_warning(inode->i_sb, 1921 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1922 "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1940 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1941 while (offset < inode->i_size) {
1943 if (!bh || 1942 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1943 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1944 unsigned int lblock;
1945 err = 0; 1945 err = 0;
1946 brelse(bh); 1946 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1947 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1949 if (!bh) {
1950 if (err) 1950 if (err)
1951 ext4_error(sb, 1951 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1952 "error %d reading directory "
1953 " #%lu offset %u", 1953 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1954 offset += sb->s_blocksize;
1956 continue; 1955 continue;
1957 } 1956 }
@@ -2297,7 +2296,7 @@ retry:
2297 } 2296 }
2298 } else { 2297 } else {
2299 /* clear the extent format for fast symlink */ 2298 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2299 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2300 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2301 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2302 inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..49d88c0597c4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
243 243
244 vfs_check_frozen(sb, SB_FREEZE_WRITE);
244 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
245 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
246 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
@@ -941,6 +942,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
941 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 942 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
942 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 943 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
943 seq_puts(seq, ",journal_async_commit"); 944 seq_puts(seq, ",journal_async_commit");
945 else if (test_opt(sb, JOURNAL_CHECKSUM))
946 seq_puts(seq, ",journal_checksum");
944 if (test_opt(sb, NOBH)) 947 if (test_opt(sb, NOBH))
945 seq_puts(seq, ",nobh"); 948 seq_puts(seq, ",nobh");
946 if (test_opt(sb, I_VERSION)) 949 if (test_opt(sb, I_VERSION))
@@ -2213,7 +2216,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2213struct ext4_attr { 2216struct ext4_attr {
2214 struct attribute attr; 2217 struct attribute attr;
2215 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2218 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2216 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2219 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2217 const char *, size_t); 2220 const char *, size_t);
2218 int offset; 2221 int offset;
2219}; 2222};
@@ -2430,6 +2433,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2430 __releases(kernel_lock) 2433 __releases(kernel_lock)
2431 __acquires(kernel_lock) 2434 __acquires(kernel_lock)
2432{ 2435{
2436 char *orig_data = kstrdup(data, GFP_KERNEL);
2433 struct buffer_head *bh; 2437 struct buffer_head *bh;
2434 struct ext4_super_block *es = NULL; 2438 struct ext4_super_block *es = NULL;
2435 struct ext4_sb_info *sbi; 2439 struct ext4_sb_info *sbi;
@@ -2793,24 +2797,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2793 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2797 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2794 spin_lock_init(&sbi->s_next_gen_lock); 2798 spin_lock_init(&sbi->s_next_gen_lock);
2795 2799
2796 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2797 ext4_count_free_blocks(sb));
2798 if (!err) {
2799 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2800 ext4_count_free_inodes(sb));
2801 }
2802 if (!err) {
2803 err = percpu_counter_init(&sbi->s_dirs_counter,
2804 ext4_count_dirs(sb));
2805 }
2806 if (!err) {
2807 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2808 }
2809 if (err) {
2810 ext4_msg(sb, KERN_ERR, "insufficient memory");
2811 goto failed_mount3;
2812 }
2813
2814 sbi->s_stripe = ext4_get_stripe_size(sbi); 2800 sbi->s_stripe = ext4_get_stripe_size(sbi);
2815 sbi->s_max_writeback_mb_bump = 128; 2801 sbi->s_max_writeback_mb_bump = 128;
2816 2802
@@ -2910,6 +2896,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2896 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2911 2897
2912no_journal: 2898no_journal:
2899 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2900 ext4_count_free_blocks(sb));
2901 if (!err)
2902 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2903 ext4_count_free_inodes(sb));
2904 if (!err)
2905 err = percpu_counter_init(&sbi->s_dirs_counter,
2906 ext4_count_dirs(sb));
2907 if (!err)
2908 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2909 if (err) {
2910 ext4_msg(sb, KERN_ERR, "insufficient memory");
2911 goto failed_mount_wq;
2912 }
2913 if (test_opt(sb, NOBH)) { 2913 if (test_opt(sb, NOBH)) {
2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
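
With this hunk the percpu free-block/inode/dir counters are initialised after the journal has been set up and replayed (the earlier initialisation was removed in a previous hunk of ext4_fill_super()), and the matching percpu_counter_destroy() calls move from failed_mount3 to the new code under failed_mount_wq below. A generic sketch of that idea, initialise later and unwind from the later error label, using plain malloc/free in place of the percpu counters; purely illustrative, not ext4 code.

#include <stdio.h>
#include <stdlib.h>

static int fill_super(int fail_at)
{
        char *flex = NULL, *counters = NULL;
        int err = -1;

        flex = malloc(16);                      /* early resource */
        if (!flex || fail_at == 1)
                goto failed_mount3;

        /* journal setup and replay would happen here ... */

        counters = malloc(16);                  /* now set up *after* the journal */
        if (!counters || fail_at == 2)
                goto failed_mount_wq;

        free(counters);
        free(flex);
        return 0;

failed_mount_wq:                                /* undoes everything since failed_mount3 */
        free(counters);
failed_mount3:
        free(flex);
        return err;
}

int main(void)
{
        printf("fail_at=1 -> %d\n", fill_super(1));
        printf("fail_at=2 -> %d\n", fill_super(2));
        printf("success   -> %d\n", fill_super(0));
        return 0;
}
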
@@ -3001,7 +3001,7 @@ no_journal:
3001 err = ext4_setup_system_zone(sb); 3001 err = ext4_setup_system_zone(sb);
3002 if (err) { 3002 if (err) {
3003 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3003 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3004 "zone (%d)\n", err); 3004 "zone (%d)", err);
3005 goto failed_mount4; 3005 goto failed_mount4;
3006 } 3006 }
3007 3007
@@ -3040,9 +3040,11 @@ no_journal:
3040 } else 3040 } else
3041 descr = "out journal"; 3041 descr = "out journal";
3042 3042
3043 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3043 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3044 "Opts: %s", descr, orig_data);
3044 3045
3045 lock_kernel(); 3046 lock_kernel();
3047 kfree(orig_data);
3046 return 0; 3048 return 0;
3047 3049
3048cantfind_ext4: 3050cantfind_ext4:
@@ -3059,6 +3061,10 @@ failed_mount_wq:
3059 jbd2_journal_destroy(sbi->s_journal); 3061 jbd2_journal_destroy(sbi->s_journal);
3060 sbi->s_journal = NULL; 3062 sbi->s_journal = NULL;
3061 } 3063 }
3064 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3065 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3066 percpu_counter_destroy(&sbi->s_dirs_counter);
3067 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3062failed_mount3: 3068failed_mount3:
3063 if (sbi->s_flex_groups) { 3069 if (sbi->s_flex_groups) {
3064 if (is_vmalloc_addr(sbi->s_flex_groups)) 3070 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3072,6 @@ failed_mount3:
3066 else 3072 else
3067 kfree(sbi->s_flex_groups); 3073 kfree(sbi->s_flex_groups);
3068 } 3074 }
3069 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3070 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3071 percpu_counter_destroy(&sbi->s_dirs_counter);
3072 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3073failed_mount2: 3075failed_mount2:
3074 for (i = 0; i < db_count; i++) 3076 for (i = 0; i < db_count; i++)
3075 brelse(sbi->s_group_desc[i]); 3077 brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3091,7 @@ out_fail:
3089 kfree(sbi->s_blockgroup_lock); 3091 kfree(sbi->s_blockgroup_lock);
3090 kfree(sbi); 3092 kfree(sbi);
3091 lock_kernel(); 3093 lock_kernel();
3094 kfree(orig_data);
3092 return ret; 3095 return ret;
3093} 3096}
3094 3097
@@ -3380,7 +3383,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3380 if (!(sb->s_flags & MS_RDONLY)) 3383 if (!(sb->s_flags & MS_RDONLY))
3381 es->s_wtime = cpu_to_le32(get_seconds()); 3384 es->s_wtime = cpu_to_le32(get_seconds());
3382 es->s_kbytes_written = 3385 es->s_kbytes_written =
3383 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3386 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3384 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3387 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3385 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3388 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3386 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3389 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3488,10 @@ int ext4_force_commit(struct super_block *sb)
3485 return 0; 3488 return 0;
3486 3489
3487 journal = EXT4_SB(sb)->s_journal; 3490 journal = EXT4_SB(sb)->s_journal;
3488 if (journal) 3491 if (journal) {
3492 vfs_check_frozen(sb, SB_FREEZE_WRITE);
3489 ret = ext4_journal_force_commit(journal); 3493 ret = ext4_journal_force_commit(journal);
3494 }
3490 3495
3491 return ret; 3496 return ret;
3492} 3497}
@@ -3535,18 +3540,16 @@ static int ext4_freeze(struct super_block *sb)
3535 * the journal. 3540 * the journal.
3536 */ 3541 */
3537 error = jbd2_journal_flush(journal); 3542 error = jbd2_journal_flush(journal);
3538 if (error < 0) { 3543 if (error < 0)
3539 out: 3544 goto out;
3540 jbd2_journal_unlock_updates(journal);
3541 return error;
3542 }
3543 3545
3544 /* Journal blocked and flushed, clear needs_recovery flag. */ 3546 /* Journal blocked and flushed, clear needs_recovery flag. */
3545 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3547 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3546 error = ext4_commit_super(sb, 1); 3548 error = ext4_commit_super(sb, 1);
3547 if (error) 3549out:
3548 goto out; 3550 /* we rely on s_frozen to stop further updates */
3549 return 0; 3551 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3552 return error;
3550} 3553}
3551 3554
3552/* 3555/*
@@ -3563,7 +3566,6 @@ static int ext4_unfreeze(struct super_block *sb)
3563 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3566 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3564 ext4_commit_super(sb, 1); 3567 ext4_commit_super(sb, 1);
3565 unlock_super(sb); 3568 unlock_super(sb);
3566 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3567 return 0; 3569 return 0;
3568} 3570}
3569 3571
@@ -3580,6 +3582,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3580#ifdef CONFIG_QUOTA 3582#ifdef CONFIG_QUOTA
3581 int i; 3583 int i;
3582#endif 3584#endif
3585 char *orig_data = kstrdup(data, GFP_KERNEL);
3583 3586
3584 lock_kernel(); 3587 lock_kernel();
3585 3588
@@ -3713,6 +3716,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3713#endif 3716#endif
3714 unlock_super(sb); 3717 unlock_super(sb);
3715 unlock_kernel(); 3718 unlock_kernel();
3719
3720 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3721 kfree(orig_data);
3716 return 0; 3722 return 0;
3717 3723
3718restore_opts: 3724restore_opts:
@@ -3734,6 +3740,7 @@ restore_opts:
3734#endif 3740#endif
3735 unlock_super(sb); 3741 unlock_super(sb);
3736 unlock_kernel(); 3742 unlock_kernel();
3743 kfree(orig_data);
3737 return err; 3744 return err;
3738} 3745}
3739 3746
@@ -4141,6 +4148,7 @@ static int __init init_ext4_fs(void)
4141{ 4148{
4142 int err; 4149 int err;
4143 4150
4151 ext4_check_flag_values();
4144 err = init_ext4_system_zone(); 4152 err = init_ext4_system_zone();
4145 if (err) 4153 if (err)
4146 return err; 4154 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 664 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 665 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 666 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 667 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 668 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 669 error = -EIO;
672 goto cleanup; 670 goto cleanup;
673 } 671 }
@@ -820,7 +818,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 818 EXT4_I(inode)->i_block_group);
821 819
822 /* non-extent files can't have physical blocks past 2^32 */ 820 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 821 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 822 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 823
826 block = ext4_new_meta_blocks(handle, inode, 824 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
828 if (error) 826 if (error)
829 goto cleanup; 827 goto cleanup;
830 828
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 829 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 830 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 831
834 ea_idebug(inode, "creating block %d", block); 832 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
880 goto cleanup; 878 goto cleanup;
881 879
882bad_block: 880bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 881 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 882 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 883 goto cleanup;
886 884
887#undef header 885#undef header
@@ -1194,8 +1192,8 @@ retry:
1194 if (!bh) 1192 if (!bh)
1195 goto cleanup; 1193 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1194 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1195 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1196 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1197 error = -EIO;
1200 goto cleanup; 1198 goto cleanup;
1201 } 1199 }
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1370 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1371 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1372 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1373 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1374 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1375 goto cleanup;
1378 } 1376 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1377 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1378 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1379 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1380 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1381 goto cleanup;
1384 } 1382 }
1385 ext4_xattr_release_block(handle, inode, bh); 1383 ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
1504 } 1502 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1503 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1504 if (!bh) {
1507 ext4_error(inode->i_sb, 1505 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1506 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1507 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1508 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1509 ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
103 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
104 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
105 if (pos == 0) 105 if (pos == 0)
106 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
107 if (pos < 3) 107 if (pos < 3)
108 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
109 109
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..e53df5ebb2b8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19 19
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 20MODULE_ALIAS_MISCDEV(FUSE_MINOR);
21MODULE_ALIAS("devname:fuse");
21 22
22static struct kmem_cache *fuse_req_cachep; 23static struct kmem_cache *fuse_req_cachep;
23 24
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 92dde6f8d893..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,6 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .unlocked_ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..db64854b7b09 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1741,6 +1741,7 @@ remove_lru_entry:
1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit(); 1742 smp_mb__after_clear_bit();
1743 } 1743 }
1744 spin_unlock(&inode->i_lock);
1744 } 1745 }
1745 spin_unlock(&nfs_access_lru_lock); 1746 spin_unlock(&nfs_access_lru_lock);
1746 nfs_access_free_list(&head); 1747 nfs_access_free_list(&head);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
1386 int res = 0; 1386 int res = 0;
1387 1387
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1389 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1391 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1392 spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1399 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1400 TASK_KILLABLE);
1401 else
1402 goto out_mark_dirty;
1401 } else 1403 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1404 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1405 return res;
1406 /* Note: If we exit without ensuring that the commit is complete,
1407 * we must mark the inode as dirty. Otherwise, future calls to
1408 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1409 * that the data is on the disk.
1410 */
1411out_mark_dirty:
1412 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1413 return res;
1405} 1414}
1406 1415
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1518 };
1510 int ret; 1519 int ret;
1511 1520
1512 while(PagePrivate(page)) { 1521 for (;;) {
1513 wait_on_page_writeback(page); 1522 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1523 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1524 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1525 if (ret < 0)
1517 goto out_error; 1526 goto out_error;
1527 continue;
1518 } 1528 }
1519 ret = sync_inode(inode, &wbc); 1529 if (!PagePrivate(page))
1530 break;
1531 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1532 if (ret < 0)
1521 goto out_error; 1533 goto out_error;
1522 } 1534 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
267 shpending = p->signal->shared_pending.signal; 267 shpending = p->signal->shared_pending.signal;
268 blocked = p->blocked; 268 blocked = p->blocked;
269 collect_sigign_sigcatch(p, &ignored, &caught); 269 collect_sigign_sigcatch(p, &ignored, &caught);
270 num_threads = atomic_read(&p->signal->count); 270 num_threads = get_nr_threads(p);
271 rcu_read_lock(); /* FIXME: is this correct? */ 271 rcu_read_lock(); /* FIXME: is this correct? */
272 qsize = atomic_read(&__task_cred(p)->user->sigpending); 272 qsize = atomic_read(&__task_cred(p)->user->sigpending);
273 rcu_read_unlock(); 273 rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
410 tty_nr = new_encode_dev(tty_devnum(sig->tty)); 410 tty_nr = new_encode_dev(tty_devnum(sig->tty));
411 } 411 }
412 412
413 num_threads = atomic_read(&sig->count); 413 num_threads = get_nr_threads(task);
414 collect_sigign_sigcatch(task, &sigign, &sigcatch); 414 collect_sigign_sigcatch(task, &sigign, &sigcatch);
415 415
416 cmin_flt = sig->cmin_flt; 416 cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
166 return result; 166 return result;
167} 167}
168 168
169static int get_nr_threads(struct task_struct *tsk)
170{
171 unsigned long flags;
172 int count = 0;
173
174 if (lock_task_sighand(tsk, &flags)) {
175 count = atomic_read(&tsk->signal->count);
176 unlock_task_sighand(tsk, &flags);
177 }
178 return count;
179}
180
181static int proc_cwd_link(struct inode *inode, struct path *path) 169static int proc_cwd_link(struct inode *inode, struct path *path)
182{ 170{
183 struct task_struct *task = get_proc_task(inode); 171 struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2444 const struct pid_entry *p = ptr; 2432 const struct pid_entry *p = ptr;
2445 struct inode *inode; 2433 struct inode *inode;
2446 struct proc_inode *ei; 2434 struct proc_inode *ei;
2447 struct dentry *error = ERR_PTR(-EINVAL); 2435 struct dentry *error;
2448 2436
2449 /* Allocate the inode */ 2437 /* Allocate the inode */
2450 error = ERR_PTR(-ENOMEM); 2438 error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
2794 2782
2795struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2783struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2796{ 2784{
2797 struct dentry *result = ERR_PTR(-ENOENT); 2785 struct dentry *result;
2798 struct task_struct *task; 2786 struct task_struct *task;
2799 unsigned tgid; 2787 unsigned tgid;
2800 struct pid_namespace *ns; 2788 struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
343/* 343/*
344 * Return an inode number between PROC_DYNAMIC_FIRST and 344 * Return an inode number between PROC_DYNAMIC_FIRST and
345 * 0xffffffff, or zero on failure. 345 * 0xffffffff, or zero on failure.
346 *
347 * Current inode allocations in the proc-fs (hex-numbers):
348 *
349 * 00000000 reserved
350 * 00000001-00000fff static entries (goners)
351 * 001 root-ino
352 *
353 * 00001000-00001fff unused
354 * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
355 * 80000000-efffffff unused
356 * f0000000-ffffffff dynamic entries
357 *
358 * Goal:
359 * Once we split the thing into several virtual filesystems,
360 * we will get rid of magical ranges (and this comment, BTW).
361 */ 346 */
362static unsigned int get_inode_number(void) 347static unsigned int get_inode_number(void)
363{ 348{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
588 */ 588 */
589static void __init proc_kcore_text_init(void) 589static void __init proc_kcore_text_init(void)
590{ 590{
591 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); 591 kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
592} 592}
593#else 593#else
594static void __init proc_kcore_text_init(void) 594static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
110 if (err) 110 if (err)
111 return; 111 return;
112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
113 err = PTR_ERR(proc_mnt);
114 if (IS_ERR(proc_mnt)) { 113 if (IS_ERR(proc_mnt)) {
115 unregister_filesystem(&proc_fs_type); 114 unregister_filesystem(&proc_fs_type);
116 return; 115 return;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..3d3fd4692133 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,6 +77,7 @@ out:
77 77
78const struct file_operations qnx4_dir_operations = 78const struct file_operations qnx4_dir_operations =
79{ 79{
80 .llseek = generic_file_llseek,
80 .read = generic_read_dir, 81 .read = generic_read_dir,
81 .readdir = qnx4_readdir, 82 .readdir = qnx4_readdir,
82 .fsync = simple_fsync, 83 .fsync = simple_fsync,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..1ad8bf076cfc 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1514,11 +1514,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1514/* 1514/*
1515 * This operation can block, but only after everything is updated 1515 * This operation can block, but only after everything is updated
1516 */ 1516 */
1517int __dquot_alloc_space(struct inode *inode, qsize_t number, 1517int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1518 int warn, int reserve)
1519{ 1518{
1520 int cnt, ret = 0; 1519 int cnt, ret = 0;
1521 char warntype[MAXQUOTAS]; 1520 char warntype[MAXQUOTAS];
1521 int warn = flags & DQUOT_SPACE_WARN;
1522 int reserve = flags & DQUOT_SPACE_RESERVE;
1523 int nofail = flags & DQUOT_SPACE_NOFAIL;
1522 1524
1523 /* 1525 /*
1524 * First test before acquiring mutex - solves deadlocks when we 1526 * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1541,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1539 continue; 1541 continue;
1540 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1542 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1541 warntype+cnt); 1543 warntype+cnt);
1542 if (ret) { 1544 if (ret && !nofail) {
1543 spin_unlock(&dq_data_lock); 1545 spin_unlock(&dq_data_lock);
1544 goto out_flush_warn; 1546 goto out_flush_warn;
1545 } 1547 }
@@ -1638,10 +1640,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1638/* 1640/*
1639 * This operation can block, but only after everything is updated 1641 * This operation can block, but only after everything is updated
1640 */ 1642 */
1641void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1643void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1642{ 1644{
1643 unsigned int cnt; 1645 unsigned int cnt;
1644 char warntype[MAXQUOTAS]; 1646 char warntype[MAXQUOTAS];
1647 int reserve = flags & DQUOT_SPACE_RESERVE;
1645 1648
1646 /* First test before acquiring mutex - solves deadlocks when we 1649 /* First test before acquiring mutex - solves deadlocks when we
1647 * re-enter the quota code and are already holding the mutex */ 1650 * re-enter the quota code and are already holding the mutex */
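
The interface change above folds __dquot_alloc_space()'s separate warn/reserve ints into one flags argument and adds a DQUOT_SPACE_NOFAIL bit that makes a failed check_bdq() non-fatal. A small sketch of that calling convention follows; the flag values and the stubbed-out quota check are illustrative, not the kernel's actual definitions.

#include <stdio.h>

#define DQUOT_SPACE_WARN        0x1
#define DQUOT_SPACE_RESERVE     0x2
#define DQUOT_SPACE_NOFAIL      0x4

static int alloc_space(unsigned long long number, int flags)
{
        int warn    = flags & DQUOT_SPACE_WARN;
        int reserve = flags & DQUOT_SPACE_RESERVE;
        int nofail  = flags & DQUOT_SPACE_NOFAIL;

        int over_quota = 1;     /* pretend check_bdq() failed */

        /* A failed quota check is ignored when the caller asked for NOFAIL. */
        if (over_quota && !nofail) {
                if (warn)
                        fprintf(stderr, "quota exceeded for %llu blocks\n", number);
                return -1;
        }
        printf("charged %llu blocks (reserve=%d)\n", number, !!reserve);
        return 0;
}

int main(void)
{
        alloc_space(8, DQUOT_SPACE_WARN);                          /* fails, warns */
        alloc_space(8, DQUOT_SPACE_NOFAIL | DQUOT_SPACE_RESERVE);  /* forced through */
        return 0;
}
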
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek usable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
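The new noop_llseek() helper complements the .llseek = generic_file_llseek additions made to the directory file_operations elsewhere in this series: instead of failing with -ESPIPE (no_llseek) or moving f_pos (generic_file_llseek), it reports success while leaving the position untouched. A sketch of the intended use, with illustrative names not taken from this diff:

#include <linux/fs.h>
#include <linux/module.h>

/* a device that cannot really seek, but whose userspace still
 * expects llseek() to succeed */
static const struct file_operations example_dev_fops = {
	.owner	= THIS_MODULE,
	.llseek	= noop_llseek,	/* returns the current f_pos unchanged */
};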
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..4455fbe269a3 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -18,6 +18,7 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
18 int datasync); 18 int datasync);
19 19
20const struct file_operations reiserfs_dir_operations = { 20const struct file_operations reiserfs_dir_operations = {
21 .llseek = generic_file_llseek,
21 .read = generic_read_dir, 22 .read = generic_read_dir,
22 .readdir = reiserfs_readdir, 23 .readdir = reiserfs_readdir,
23 .fsync = reiserfs_dir_fsync, 24 .fsync = reiserfs_dir_fsync,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 6c978428892d..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,6 +37,7 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
37 37
38const struct file_operations smb_dir_operations = 38const struct file_operations smb_dir_operations =
39{ 39{
40 .llseek = generic_file_llseek,
40 .read = generic_read_dir, 41 .read = generic_read_dir,
41 .readdir = smb_readdir, 42 .readdir = smb_readdir,
42 .unlocked_ioctl = smb_ioctl, 43 .unlocked_ioctl = smb_ioctl,
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config SQUASHFS_XATTRS
30 bool "Squashfs XATTR support"
31 depends on SQUASHFS
32 default n
33 help
34 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page).
37
38 If unsure, say N.
39
29config SQUASHFS_EMBEDDED 40config SQUASHFS_EMBEDDED
30 41
31 bool "Additional option for memory-constrained systems" 42 bool "Additional option for memory-constrained systems"
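With the option above, Squashfs xattr support is compiled in only when explicitly selected; it depends on SQUASHFS and defaults to n. An illustrative configuration fragment that enables it (values assumed, not part of this diff):

CONFIG_SQUASHFS=y
CONFIG_SQUASHFS_XATTRS=y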
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
9
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
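The new SQUASHFS_XATTR_BLK()/SQUASHFS_XATTR_OFFSET() macros above split the 64-bit xattr reference stored in struct squashfs_xattr_id into the start of a metadata block (upper bits) and a byte offset inside the uncompressed block (low 16 bits), mirroring how inode references are encoded. A small host-side worked example, using only the macro definitions shown in this hunk (the reference value is made up for illustration):

#include <stdio.h>

#define SQUASHFS_XATTR_BLK(A)		((unsigned int) ((A) >> 16))
#define SQUASHFS_XATTR_OFFSET(A)	((unsigned int) ((A) & 0xffff))

int main(void)
{
	unsigned long long ref = 0x000000012345006fULL;

	/* prints "block 0x12345 offset 0x6f" */
	printf("block %#x offset %#x\n",
	       SQUASHFS_XATTR_BLK(ref), SQUASHFS_XATTR_OFFSET(ref));
	return 0;
}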
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
91			/* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
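squashfs_listxattr() and the per-namespace get handlers added above are what back the ordinary listxattr(2)/getxattr(2) system calls on a mounted image. A minimal user-space sketch that exercises them (the mount point, file name and attribute name are illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	char names[1024], value[256];
	ssize_t n = listxattr("/mnt/squash/file", names, sizeof(names));

	/* the name list is a sequence of NUL-terminated strings */
	for (char *p = names; n > 0 && p < names + n; p += strlen(p) + 1)
		printf("name: %s\n", p);

	n = getxattr("/mnt/squash/file", "user.comment", value, sizeof(value));
	if (n >= 0)
		printf("user.comment = %.*s\n", (int)n, value);
	return 0;
}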
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
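squashfs_xattr_lookup() above performs a two-level lookup: the 32-bit id selects a 16-byte struct squashfs_xattr_id entry via SQUASHFS_XATTR_BLOCK()/SQUASHFS_XATTR_BLOCK_OFFSET(), and that entry in turn carries the 64-bit reference decoded by squashfs_listxattr()/squashfs_xattr_get(). A host-side sketch of the index arithmetic; 8192 is assumed for SQUASHFS_METADATA_SIZE, which is defined outside this diff:

#include <stdio.h>

#define METADATA_SIZE	8192	/* assumed SQUASHFS_METADATA_SIZE */
#define ENTRY_SIZE	16	/* sizeof(struct squashfs_xattr_id) */

int main(void)
{
	unsigned int index = 1000;	/* illustrative xattr id */
	unsigned int block = index * ENTRY_SIZE / METADATA_SIZE;
	unsigned int offset = index * ENTRY_SIZE % METADATA_SIZE;

	/* id 1000 lands at byte 7808 (entry 488) of the second
	 * indexed metadata block: prints "block 1 offset 7808" */
	printf("block %u offset %u\n", block, offset);
	return 0;
}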
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..1660c81ffa3d 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,6 +207,7 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .unlocked_ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..ad9bc1ebd3a6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -918,6 +918,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 918 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 920 case UFS_MAGIC:
921 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 922 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 923 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 924 case UFS_MAGIC_FEA:
@@ -927,6 +928,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 928 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 929 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 930 case UFS_MAGIC:
931 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 932 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 933 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 934 case UFS_MAGIC_FEA:
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54