aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLachlan McIlroy <lachlan@redback.melbourne.sgi.com>2009-01-14 00:29:08 -0500
committerLachlan McIlroy <lachlan@redback.melbourne.sgi.com>2009-01-14 00:29:08 -0500
commitc088f4e9da74b901f7ed1749ad697d77622ed0f9 (patch)
tree84cddf20369f82f10c1c3712e6cce20dd1b9d863 /fs
parentce79735c12d62c3cda38eb31762cf98e87c7b087 (diff)
parenta6525042bfdfcab128bd91fad264de10fd24a55e (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig71
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/binfmt_elf.c12
-rw-r--r--fs/binfmt_elf_fdpic.c35
-rw-r--r--fs/binfmt_flat.c34
-rw-r--r--fs/bio.c36
-rw-r--r--fs/block_dev.c31
-rw-r--r--fs/btrfs/Makefile25
-rw-r--r--fs/btrfs/acl.c351
-rw-r--r--fs/btrfs/async-thread.c419
-rw-r--r--fs/btrfs/async-thread.h101
-rw-r--r--fs/btrfs/btrfs_inode.h131
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c709
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/crc32c.h29
-rw-r--r--fs/btrfs/ctree.c3953
-rw-r--r--fs/btrfs/ctree.h2129
-rw-r--r--fs/btrfs/dir-item.c386
-rw-r--r--fs/btrfs/disk-io.c2343
-rw-r--r--fs/btrfs/disk-io.h102
-rw-r--r--fs/btrfs/export.c203
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c5986
-rw-r--r--fs/btrfs/extent_io.c3717
-rw-r--r--fs/btrfs/extent_io.h269
-rw-r--r--fs/btrfs/extent_map.c351
-rw-r--r--fs/btrfs/extent_map.h62
-rw-r--r--fs/btrfs/file-item.c831
-rw-r--r--fs/btrfs/file.c1288
-rw-r--r--fs/btrfs/free-space-cache.c495
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c206
-rw-r--r--fs/btrfs/inode-map.c144
-rw-r--r--fs/btrfs/inode.c5035
-rw-r--r--fs/btrfs/ioctl.c1132
-rw-r--r--fs/btrfs/ioctl.h67
-rw-r--r--fs/btrfs/locking.c88
-rw-r--r--fs/btrfs/locking.h27
-rw-r--r--fs/btrfs/ordered-data.c730
-rw-r--r--fs/btrfs/ordered-data.h158
-rw-r--r--fs/btrfs/orphan.c67
-rw-r--r--fs/btrfs/print-tree.c216
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c230
-rw-r--r--fs/btrfs/ref-cache.h77
-rw-r--r--fs/btrfs/root-tree.c366
-rw-r--r--fs/btrfs/struct-funcs.c139
-rw-r--r--fs/btrfs/super.c722
-rw-r--r--fs/btrfs/sysfs.c269
-rw-r--r--fs/btrfs/transaction.c1097
-rw-r--r--fs/btrfs/transaction.h106
-rw-r--r--fs/btrfs/tree-defrag.c147
-rw-r--r--fs/btrfs/tree-log.c2898
-rw-r--r--fs/btrfs/tree-log.h41
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3218
-rw-r--r--fs/btrfs/volumes.h162
-rw-r--r--fs/btrfs/xattr.c322
-rw-r--r--fs/btrfs/xattr.h39
-rw-r--r--fs/btrfs/zlib.c632
-rw-r--r--fs/buffer.c74
-rw-r--r--fs/coda/sysctl.c5
-rw-r--r--fs/dcache.c14
-rw-r--r--fs/dlm/debug_fs.c696
-rw-r--r--fs/dlm/dlm_internal.h2
-rw-r--r--fs/dlm/lock.c26
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/recover.c10
-rw-r--r--fs/dquot.c2
-rw-r--r--fs/ext2/ialloc.c8
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/ioctl.c3
-rw-r--r--fs/ext2/super.c10
-rw-r--r--fs/ext3/hash.c77
-rw-r--r--fs/ext3/ialloc.c8
-rw-r--r--fs/ext3/ioctl.c3
-rw-r--r--fs/ext3/namei.c15
-rw-r--r--fs/ext3/super.c88
-rw-r--r--fs/ext4/balloc.c293
-rw-r--r--fs/ext4/bitmap.c5
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h152
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_i.h16
-rw-r--r--fs/ext4/ext4_jbd2.c83
-rw-r--r--fs/ext4/ext4_jbd2.h87
-rw-r--r--fs/ext4/ext4_sb.h6
-rw-r--r--fs/ext4/extents.c62
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/ext4/hash.c77
-rw-r--r--fs/ext4/ialloc.c324
-rw-r--r--fs/ext4/inode.c309
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c629
-rw-r--r--fs/ext4/mballoc.h71
-rw-r--r--fs/ext4/migrate.c19
-rw-r--r--fs/ext4/namei.c96
-rw-r--r--fs/ext4/resize.c113
-rw-r--r--fs/ext4/super.c663
-rw-r--r--fs/ext4/xattr.c25
-rw-r--r--fs/gfs2/ops_super.c16
-rw-r--r--fs/inode.c19
-rw-r--r--fs/ioctl.c46
-rw-r--r--fs/ioprio.c3
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/transaction.c39
-rw-r--r--fs/jbd2/checkpoint.c24
-rw-r--r--fs/jbd2/commit.c58
-rw-r--r--fs/jbd2/journal.c124
-rw-r--r--fs/jbd2/transaction.c60
-rw-r--r--fs/jffs2/compr_rubin.c120
-rw-r--r--fs/jffs2/erase.c5
-rw-r--r--fs/jffs2/nodelist.h3
-rw-r--r--fs/jfs/super.c10
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/dlmglue.c4
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/partitions/check.c1
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--fs/proc/nommu.c71
-rw-r--r--fs/proc/task_nommu.c120
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/ramfs/file-nommu.c21
-rw-r--r--fs/reiserfs/super.c10
-rw-r--r--fs/romfs/inode.c12
-rw-r--r--fs/select.c2
-rw-r--r--fs/splice.c1
-rw-r--r--fs/squashfs/Makefile8
-rw-r--r--fs/squashfs/block.c274
-rw-r--r--fs/squashfs/cache.c412
-rw-r--r--fs/squashfs/dir.c235
-rw-r--r--fs/squashfs/export.c155
-rw-r--r--fs/squashfs/file.c502
-rw-r--r--fs/squashfs/fragment.c98
-rw-r--r--fs/squashfs/id.c94
-rw-r--r--fs/squashfs/inode.c346
-rw-r--r--fs/squashfs/namei.c242
-rw-r--r--fs/squashfs/squashfs.h90
-rw-r--r--fs/squashfs/squashfs_fs.h381
-rw-r--r--fs/squashfs/squashfs_fs_i.h45
-rw-r--r--fs/squashfs/squashfs_fs_sb.h76
-rw-r--r--fs/squashfs/super.c440
-rw-r--r--fs/squashfs/symlink.c118
-rw-r--r--fs/super.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c8
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c11
-rw-r--r--fs/xfs/xfs_fsops.h2
154 files changed, 49038 insertions, 2025 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 32883589ee54..51307b0fdf0f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -269,6 +269,25 @@ config OCFS2_FS_POSIX_ACL
269 Posix Access Control Lists (ACLs) support permissions for users and 269 Posix Access Control Lists (ACLs) support permissions for users and
270 groups beyond the owner/group/world scheme. 270 groups beyond the owner/group/world scheme.
271 271
272config BTRFS_FS
273 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
274 depends on EXPERIMENTAL
275 select LIBCRC32C
276 select ZLIB_INFLATE
277 select ZLIB_DEFLATE
278 help
279 Btrfs is a new filesystem with extents, writable snapshotting,
280 support for multiple devices and many more features.
281
282 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
283 FINALIZED. You should say N here unless you are interested in
284 testing Btrfs with non-critical data.
285
286 To compile this file system support as a module, choose M here. The
287 module will be called btrfs.
288
289 If unsure, say N.
290
272endif # BLOCK 291endif # BLOCK
273 292
274source "fs/notify/Kconfig" 293source "fs/notify/Kconfig"
@@ -913,6 +932,58 @@ config CRAMFS
913 932
914 If unsure, say N. 933 If unsure, say N.
915 934
935config SQUASHFS
936 tristate "SquashFS 4.0 - Squashed file system support"
937 depends on BLOCK
938 select ZLIB_INFLATE
939 help
940 Saying Y here includes support for SquashFS 4.0 (a Compressed
941 Read-Only File System). Squashfs is a highly compressed read-only
942 filesystem for Linux. It uses zlib compression to compress both
943 files, inodes and directories. Inodes in the system are very small
944 and all blocks are packed to minimise data overhead. Block sizes
945 greater than 4K are supported up to a maximum of 1 Mbytes (default
946 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
947 (larger than 4GB), full uid/gid information, hard links and
948 timestamps.
949
950 Squashfs is intended for general read-only filesystem use, for
951 archival use (i.e. in cases where a .tar.gz file may be used), and in
952 embedded systems where low overhead is needed. Further information
953 and tools are available from http://squashfs.sourceforge.net.
954
955 If you want to compile this as a module ( = code which can be
956 inserted in and removed from the running kernel whenever you want),
957 say M here and read <file:Documentation/modules.txt>. The module
958 will be called squashfs. Note that the root file system (the one
959 containing the directory /) cannot be compiled as a module.
960
961 If unsure, say N.
962
963config SQUASHFS_EMBEDDED
964
965 bool "Additional option for memory-constrained systems"
966 depends on SQUASHFS
967 default n
968 help
969 Saying Y here allows you to specify cache size.
970
971 If unsure, say N.
972
973config SQUASHFS_FRAGMENT_CACHE_SIZE
974 int "Number of fragments cached" if SQUASHFS_EMBEDDED
975 depends on SQUASHFS
976 default "3"
977 help
978 By default SquashFS caches the last 3 fragments read from
979 the filesystem. Increasing this amount may mean SquashFS
980 has to re-read fragments less often from disk, at the expense
981 of extra system memory. Decreasing this amount will mean
982 SquashFS uses less memory at the expense of extra reads from disk.
983
984 Note there must be at least one cached fragment. Anything
985 much more than three will probably not make much difference.
986
916config VXFS_FS 987config VXFS_FS
917 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" 988 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
918 depends on BLOCK 989 depends on BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae4..bb4cc5b8abc8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
43config CORE_DUMP_DEFAULT_ELF_HEADERS 43config CORE_DUMP_DEFAULT_ELF_HEADERS
44 bool "Write ELF core dumps with partial segments" 44 bool "Write ELF core dumps with partial segments"
45 default n 45 default n
46 depends on BINFMT_ELF 46 depends on BINFMT_ELF && ELF_CORE
47 help 47 help
48 ELF core dump files describe each memory mapping of the crashed 48 ELF core dump files describe each memory mapping of the crashed
49 process, and can contain or omit the memory contents of each one. 49 process, and can contain or omit the memory contents of each one.
diff --git a/fs/Makefile b/fs/Makefile
index c830611550d3..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
74obj-$(CONFIG_JBD2) += jbd2/ 74obj-$(CONFIG_JBD2) += jbd2/
75obj-$(CONFIG_EXT2_FS) += ext2/ 75obj-$(CONFIG_EXT2_FS) += ext2/
76obj-$(CONFIG_CRAMFS) += cramfs/ 76obj-$(CONFIG_CRAMFS) += cramfs/
77obj-$(CONFIG_SQUASHFS) += squashfs/
77obj-y += ramfs/ 78obj-y += ramfs/
78obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ 79obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
79obj-$(CONFIG_CODA_FS) += coda/ 80obj-$(CONFIG_CODA_FS) += coda/
@@ -119,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
119obj-$(CONFIG_HPPFS) += hppfs/ 120obj-$(CONFIG_HPPFS) += hppfs/
120obj-$(CONFIG_DEBUG_FS) += debugfs/ 121obj-$(CONFIG_DEBUG_FS) += debugfs/
121obj-$(CONFIG_OCFS2_FS) += ocfs2/ 122obj-$(CONFIG_OCFS2_FS) += ocfs2/
123obj-$(CONFIG_BTRFS_FS) += btrfs/
122obj-$(CONFIG_GFS2_FS) += gfs2/ 124obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
152 elf_addr_t __user *sp; 152 elf_addr_t __user *sp;
153 elf_addr_t __user *u_platform; 153 elf_addr_t __user *u_platform;
154 elf_addr_t __user *u_base_platform; 154 elf_addr_t __user *u_base_platform;
155 elf_addr_t __user *u_rand_bytes;
155 const char *k_platform = ELF_PLATFORM; 156 const char *k_platform = ELF_PLATFORM;
156 const char *k_base_platform = ELF_BASE_PLATFORM; 157 const char *k_base_platform = ELF_BASE_PLATFORM;
158 unsigned char k_rand_bytes[16];
157 int items; 159 int items;
158 elf_addr_t *elf_info; 160 elf_addr_t *elf_info;
159 int ei_index = 0; 161 int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
196 return -EFAULT; 198 return -EFAULT;
197 } 199 }
198 200
201 /*
202 * Generate 16 random bytes for userspace PRNG seeding.
203 */
204 get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
205 u_rand_bytes = (elf_addr_t __user *)
206 STACK_ALLOC(p, sizeof(k_rand_bytes));
207 if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
208 return -EFAULT;
209
199 /* Create the ELF interpreter info */ 210 /* Create the ELF interpreter info */
200 elf_info = (elf_addr_t *)current->mm->saved_auxv; 211 elf_info = (elf_addr_t *)current->mm->saved_auxv;
201 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ 212 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
228 NEW_AUX_ENT(AT_GID, cred->gid); 239 NEW_AUX_ENT(AT_GID, cred->gid);
229 NEW_AUX_ENT(AT_EGID, cred->egid); 240 NEW_AUX_ENT(AT_EGID, cred->egid);
230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 241 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
242 NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
231 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 243 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
232 if (k_platform) { 244 if (k_platform) {
233 NEW_AUX_ENT(AT_PLATFORM, 245 NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..f3e72c5c19f5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
168 struct elf_fdpic_params exec_params, interp_params; 168 struct elf_fdpic_params exec_params, interp_params;
169 struct elf_phdr *phdr; 169 struct elf_phdr *phdr;
170 unsigned long stack_size, entryaddr; 170 unsigned long stack_size, entryaddr;
171#ifndef CONFIG_MMU
172 unsigned long fullsize;
173#endif
174#ifdef ELF_FDPIC_PLAT_INIT 171#ifdef ELF_FDPIC_PLAT_INIT
175 unsigned long dynaddr; 172 unsigned long dynaddr;
176#endif 173#endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
390 goto error_kill; 387 goto error_kill;
391 } 388 }
392 389
393 /* expand the stack mapping to use up the entire allocation granule */
394 fullsize = kobjsize((char *) current->mm->start_brk);
395 if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
396 fullsize, 0, 0)))
397 stack_size = fullsize;
398 up_write(&current->mm->mmap_sem); 390 up_write(&current->mm->mmap_sem);
399 391
400 current->mm->brk = current->mm->start_brk; 392 current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
1567static int elf_fdpic_dump_segments(struct file *file, size_t *size, 1559static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1568 unsigned long *limit, unsigned long mm_flags) 1560 unsigned long *limit, unsigned long mm_flags)
1569{ 1561{
1570 struct vm_list_struct *vml; 1562 struct vm_area_struct *vma;
1571
1572 for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
1573 struct vm_area_struct *vma = vml->vma;
1574 1563
1564 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1575 if (!maydump(vma, mm_flags)) 1565 if (!maydump(vma, mm_flags))
1576 continue; 1566 continue;
1577 1567
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1617 elf_fpxregset_t *xfpu = NULL; 1607 elf_fpxregset_t *xfpu = NULL;
1618#endif 1608#endif
1619 int thread_status_size = 0; 1609 int thread_status_size = 0;
1620#ifndef CONFIG_MMU
1621 struct vm_list_struct *vml;
1622#endif
1623 elf_addr_t *auxv; 1610 elf_addr_t *auxv;
1624 unsigned long mm_flags; 1611 unsigned long mm_flags;
1625 1612
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1685 fill_prstatus(prstatus, current, signr); 1672 fill_prstatus(prstatus, current, signr);
1686 elf_core_copy_regs(&prstatus->pr_reg, regs); 1673 elf_core_copy_regs(&prstatus->pr_reg, regs);
1687 1674
1688#ifdef CONFIG_MMU
1689 segs = current->mm->map_count; 1675 segs = current->mm->map_count;
1690#else
1691 segs = 0;
1692 for (vml = current->mm->context.vmlist; vml; vml = vml->next)
1693 segs++;
1694#endif
1695#ifdef ELF_CORE_EXTRA_PHDRS 1676#ifdef ELF_CORE_EXTRA_PHDRS
1696 segs += ELF_CORE_EXTRA_PHDRS; 1677 segs += ELF_CORE_EXTRA_PHDRS;
1697#endif 1678#endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1766 mm_flags = current->mm->flags; 1747 mm_flags = current->mm->flags;
1767 1748
1768 /* write program headers for segments dump */ 1749 /* write program headers for segments dump */
1769 for ( 1750 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1770#ifdef CONFIG_MMU
1771 vma = current->mm->mmap; vma; vma = vma->vm_next
1772#else
1773 vml = current->mm->context.vmlist; vml; vml = vml->next
1774#endif
1775 ) {
1776 struct elf_phdr phdr; 1751 struct elf_phdr phdr;
1777 size_t sz; 1752 size_t sz;
1778 1753
1779#ifndef CONFIG_MMU
1780 vma = vml->vma;
1781#endif
1782
1783 sz = vma->vm_end - vma->vm_start; 1754 sz = vma->vm_end - vma->vm_start;
1784 1755
1785 phdr.p_type = PT_LOAD; 1756 phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b3725..5cebf0b37798 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
417 unsigned long textpos = 0, datapos = 0, result; 417 unsigned long textpos = 0, datapos = 0, result;
418 unsigned long realdatastart = 0; 418 unsigned long realdatastart = 0;
419 unsigned long text_len, data_len, bss_len, stack_len, flags; 419 unsigned long text_len, data_len, bss_len, stack_len, flags;
420 unsigned long len, reallen, memp = 0; 420 unsigned long len, memp = 0;
421 unsigned long extra, rlim; 421 unsigned long memp_size, extra, rlim;
422 unsigned long *reloc = 0, *rp; 422 unsigned long *reloc = 0, *rp;
423 struct inode *inode; 423 struct inode *inode;
424 int i, rev, relocs = 0; 424 int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
543 } 543 }
544 544
545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
546 len = PAGE_ALIGN(len);
546 down_write(&current->mm->mmap_sem); 547 down_write(&current->mm->mmap_sem);
547 realdatastart = do_mmap(0, 0, len, 548 realdatastart = do_mmap(0, 0, len,
548 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 549 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
549 /* Remap to use all availabe slack region space */
550 if (realdatastart && (realdatastart < (unsigned long)-4096)) {
551 reallen = kobjsize((void *)realdatastart);
552 if (reallen > len) {
553 realdatastart = do_mremap(realdatastart, len,
554 reallen, MREMAP_FIXED, realdatastart);
555 }
556 }
557 up_write(&current->mm->mmap_sem); 550 up_write(&current->mm->mmap_sem);
558 551
559 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 552 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
591 584
592 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); 585 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
593 memp = realdatastart; 586 memp = realdatastart;
594 587 memp_size = len;
595 } else { 588 } else {
596 589
597 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 590 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
591 len = PAGE_ALIGN(len);
598 down_write(&current->mm->mmap_sem); 592 down_write(&current->mm->mmap_sem);
599 textpos = do_mmap(0, 0, len, 593 textpos = do_mmap(0, 0, len,
600 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 594 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
601 /* Remap to use all availabe slack region space */
602 if (textpos && (textpos < (unsigned long) -4096)) {
603 reallen = kobjsize((void *)textpos);
604 if (reallen > len) {
605 textpos = do_mremap(textpos, len, reallen,
606 MREMAP_FIXED, textpos);
607 }
608 }
609 up_write(&current->mm->mmap_sem); 595 up_write(&current->mm->mmap_sem);
610 596
611 if (!textpos || textpos >= (unsigned long) -4096) { 597 if (!textpos || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
622 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + 608 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
623 MAX_SHARED_LIBS * sizeof(unsigned long)); 609 MAX_SHARED_LIBS * sizeof(unsigned long));
624 memp = textpos; 610 memp = textpos;
625 611 memp_size = len;
626#ifdef CONFIG_BINFMT_ZFLAT 612#ifdef CONFIG_BINFMT_ZFLAT
627 /* 613 /*
628 * load it all in and treat it like a RAM load from now on 614 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
680 * set up the brk stuff, uses any slack left in data/bss/stack 666 * set up the brk stuff, uses any slack left in data/bss/stack
681 * allocation. We put the brk after the bss (between the bss 667 * allocation. We put the brk after the bss (between the bss
682 * and stack) like other platforms. 668 * and stack) like other platforms.
669 * Userspace code relies on the stack pointer starting out at
670 * an address right at the end of a page.
683 */ 671 */
684 current->mm->start_brk = datapos + data_len + bss_len; 672 current->mm->start_brk = datapos + data_len + bss_len;
685 current->mm->brk = (current->mm->start_brk + 3) & ~3; 673 current->mm->brk = (current->mm->start_brk + 3) & ~3;
686 current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; 674 current->mm->context.end_brk = memp + memp_size - stack_len;
687 } 675 }
688 676
689 if (flags & FLAT_FLAG_KTRACE) 677 if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
790 778
791 /* zero the BSS, BRK and stack areas */ 779 /* zero the BSS, BRK and stack areas */
792 memset((void*)(datapos + data_len), 0, bss_len + 780 memset((void*)(datapos + data_len), 0, bss_len +
793 (memp + kobjsize((void *) memp) - stack_len - /* end brk */ 781 (memp + memp_size - stack_len - /* end brk */
794 libinfo->lib_list[id].start_brk) + /* start brk */ 782 libinfo->lib_list[id].start_brk) + /* start brk */
795 stack_len); 783 stack_len);
796 784
797 return 0; 785 return 0;
diff --git a/fs/bio.c b/fs/bio.c
index 711cee103602..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
788 int i, ret; 788 int i, ret;
789 int nr_pages = 0; 789 int nr_pages = 0;
790 unsigned int len = 0; 790 unsigned int len = 0;
791 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
791 792
792 for (i = 0; i < iov_count; i++) { 793 for (i = 0; i < iov_count; i++) {
793 unsigned long uaddr; 794 unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
814 bio->bi_rw |= (!write_to_vm << BIO_RW); 815 bio->bi_rw |= (!write_to_vm << BIO_RW);
815 816
816 ret = 0; 817 ret = 0;
817 i = 0; 818
819 if (map_data) {
820 nr_pages = 1 << map_data->page_order;
821 i = map_data->offset / PAGE_SIZE;
822 }
818 while (len) { 823 while (len) {
819 unsigned int bytes; 824 unsigned int bytes = PAGE_SIZE;
820 825
821 if (map_data) 826 bytes -= offset;
822 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
823 else
824 bytes = PAGE_SIZE;
825 827
826 if (bytes > len) 828 if (bytes > len)
827 bytes = len; 829 bytes = len;
828 830
829 if (map_data) { 831 if (map_data) {
830 if (i == map_data->nr_entries) { 832 if (i == map_data->nr_entries * nr_pages) {
831 ret = -ENOMEM; 833 ret = -ENOMEM;
832 break; 834 break;
833 } 835 }
834 page = map_data->pages[i++]; 836
835 } else 837 page = map_data->pages[i / nr_pages];
838 page += (i % nr_pages);
839
840 i++;
841 } else {
836 page = alloc_page(q->bounce_gfp | gfp_mask); 842 page = alloc_page(q->bounce_gfp | gfp_mask);
837 if (!page) { 843 if (!page) {
838 ret = -ENOMEM; 844 ret = -ENOMEM;
839 break; 845 break;
846 }
840 } 847 }
841 848
842 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) 849 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
843 break; 850 break;
844 851
845 len -= bytes; 852 len -= bytes;
853 offset = 0;
846 } 854 }
847 855
848 if (ret) 856 if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
851 /* 859 /*
852 * success 860 * success
853 */ 861 */
854 if (!write_to_vm) { 862 if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
855 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 863 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
856 if (ret) 864 if (ret)
857 goto cleanup; 865 goto cleanup;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b957717e25ab..b3c1efff5e1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
285 INIT_LIST_HEAD(&bdev->bd_holder_list); 285 INIT_LIST_HEAD(&bdev->bd_holder_list);
286#endif 286#endif
287 inode_init_once(&ei->vfs_inode); 287 inode_init_once(&ei->vfs_inode);
288 /* Initialize mutex for freeze. */
289 mutex_init(&bdev->bd_fsfreeze_mutex);
288} 290}
289 291
290static inline void __bd_forget(struct inode *inode) 292static inline void __bd_forget(struct inode *inode)
@@ -1005,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1005 } 1007 }
1006 1008
1007 lock_kernel(); 1009 lock_kernel();
1010 restart:
1008 1011
1009 ret = -ENXIO; 1012 ret = -ENXIO;
1010 disk = get_gendisk(bdev->bd_dev, &partno); 1013 disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1025 1028
1026 if (disk->fops->open) { 1029 if (disk->fops->open) {
1027 ret = disk->fops->open(bdev, mode); 1030 ret = disk->fops->open(bdev, mode);
1031 if (ret == -ERESTARTSYS) {
1032 /* Lost a race with 'disk' being
1033 * deleted, try again.
1034 * See md.c
1035 */
1036 disk_put_part(bdev->bd_part);
1037 bdev->bd_part = NULL;
1038 module_put(disk->fops->owner);
1039 put_disk(disk);
1040 bdev->bd_disk = NULL;
1041 mutex_unlock(&bdev->bd_mutex);
1042 goto restart;
1043 }
1028 if (ret) 1044 if (ret)
1029 goto out_clear; 1045 goto out_clear;
1030 } 1046 }
@@ -1220,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1220 return blkdev_ioctl(bdev, mode, cmd, arg); 1236 return blkdev_ioctl(bdev, mode, cmd, arg);
1221} 1237}
1222 1238
1239/*
1240 * Try to release a page associated with block device when the system
1241 * is under memory pressure.
1242 */
1243static int blkdev_releasepage(struct page *page, gfp_t wait)
1244{
1245 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1246
1247 if (super && super->s_op->bdev_try_to_free_page)
1248 return super->s_op->bdev_try_to_free_page(super, page, wait);
1249
1250 return try_to_free_buffers(page);
1251}
1252
1223static const struct address_space_operations def_blk_aops = { 1253static const struct address_space_operations def_blk_aops = {
1224 .readpage = blkdev_readpage, 1254 .readpage = blkdev_readpage,
1225 .writepage = blkdev_writepage, 1255 .writepage = blkdev_writepage,
@@ -1227,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = {
1227 .write_begin = blkdev_write_begin, 1257 .write_begin = blkdev_write_begin,
1228 .write_end = blkdev_write_end, 1258 .write_end = blkdev_write_end,
1229 .writepages = generic_writepages, 1259 .writepages = generic_writepages,
1260 .releasepage = blkdev_releasepage,
1230 .direct_IO = blkdev_direct_IO, 1261 .direct_IO = blkdev_direct_IO,
1231}; 1262};
1232 1263
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
12else
13
14# Normal Makefile
15
16KERNELDIR := /lib/modules/`uname -r`/build
17all:
18 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
19
20modules_install:
21 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
22clean:
23 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
24
25endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 kfree(value);
165
166 if (!ret)
167 btrfs_update_cached_acl(inode, p_acl, acl);
168
169 return ret;
170}
171
172static int btrfs_xattr_set_acl(struct inode *inode, int type,
173 const void *value, size_t size)
174{
175 int ret = 0;
176 struct posix_acl *acl = NULL;
177
178 if (value) {
179 acl = posix_acl_from_xattr(value, size);
180 if (acl == NULL) {
181 value = NULL;
182 size = 0;
183 } else if (IS_ERR(acl)) {
184 return PTR_ERR(acl);
185 }
186 }
187
188 ret = btrfs_set_acl(inode, acl, type);
189
190 posix_acl_release(acl);
191
192 return ret;
193}
194
195
196static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
197 void *value, size_t size)
198{
199 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
200}
201
202static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
203 const void *value, size_t size, int flags)
204{
205 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
206}
207
208static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
209 void *value, size_t size)
210{
211 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
212}
213
214static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
215 const void *value, size_t size, int flags)
216{
217 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
218}
219
220int btrfs_check_acl(struct inode *inode, int mask)
221{
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
226
227 if (IS_ERR(acl))
228 return PTR_ERR(acl);
229 if (acl) {
230 error = posix_acl_permission(inode, acl, mask);
231 posix_acl_release(acl);
232 }
233
234 return error;
235}
236
237/*
238 * btrfs_init_acl is already generally called under fs_mutex, so the locking
239 * stuff has been fixed to work with that. If the locking stuff changes, we
240 * need to re-evaluate the acl locking stuff.
241 */
242int btrfs_init_acl(struct inode *inode, struct inode *dir)
243{
244 struct posix_acl *acl = NULL;
245 int ret = 0;
246
247 /* this happens with subvols */
248 if (!dir)
249 return 0;
250
251 if (!S_ISLNK(inode->i_mode)) {
252 if (IS_POSIXACL(dir)) {
253 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
254 if (IS_ERR(acl))
255 return PTR_ERR(acl);
256 }
257
258 if (!acl)
259 inode->i_mode &= ~current->fs->umask;
260 }
261
262 if (IS_POSIXACL(dir) && acl) {
263 struct posix_acl *clone;
264 mode_t mode;
265
266 if (S_ISDIR(inode->i_mode)) {
267 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
268 if (ret)
269 goto failed;
270 }
271 clone = posix_acl_clone(acl, GFP_NOFS);
272 ret = -ENOMEM;
273 if (!clone)
274 goto failed;
275
276 mode = inode->i_mode;
277 ret = posix_acl_create_masq(clone, &mode);
278 if (ret >= 0) {
279 inode->i_mode = mode;
280 if (ret > 0) {
281 /* we need an acl */
282 ret = btrfs_set_acl(inode, clone,
283 ACL_TYPE_ACCESS);
284 }
285 }
286 }
287failed:
288 posix_acl_release(acl);
289
290 return ret;
291}
292
293int btrfs_acl_chmod(struct inode *inode)
294{
295 struct posix_acl *acl, *clone;
296 int ret = 0;
297
298 if (S_ISLNK(inode->i_mode))
299 return -EOPNOTSUPP;
300
301 if (!IS_POSIXACL(inode))
302 return 0;
303
304 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
305 if (IS_ERR(acl) || !acl)
306 return PTR_ERR(acl);
307
308 clone = posix_acl_clone(acl, GFP_KERNEL);
309 posix_acl_release(acl);
310 if (!clone)
311 return -ENOMEM;
312
313 ret = posix_acl_chmod_masq(clone, inode->i_mode);
314 if (!ret)
315 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
316
317 posix_acl_release(clone);
318
319 return ret;
320}
321
322struct xattr_handler btrfs_xattr_acl_default_handler = {
323 .prefix = POSIX_ACL_XATTR_DEFAULT,
324 .get = btrfs_xattr_acl_default_get,
325 .set = btrfs_xattr_acl_default_set,
326};
327
328struct xattr_handler btrfs_xattr_acl_access_handler = {
329 .prefix = POSIX_ACL_XATTR_ACCESS,
330 .get = btrfs_xattr_acl_access_get,
331 .set = btrfs_xattr_acl_access_set,
332};
333
334#else /* CONFIG_FS_POSIX_ACL */
335
336int btrfs_acl_chmod(struct inode *inode)
337{
338 return 0;
339}
340
341int btrfs_init_acl(struct inode *inode, struct inode *dir)
342{
343 return 0;
344}
345
346int btrfs_check_acl(struct inode *inode, int mask)
347{
348 return 0;
349}
350
351#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23# include <linux/freezer.h>
24#include "async-thread.h"
25
26#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2
29
30/*
31 * container for the kthread task pointer and the list of pending work
32 * One of these is allocated per thread.
33 */
34struct btrfs_worker_thread {
35 /* pool we belong to */
36 struct btrfs_workers *workers;
37
38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending;
40
41 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list;
43
44 /* kthread */
45 struct task_struct *task;
46
47 /* number of things on the pending list */
48 atomic_t num_pending;
49
50 unsigned long sequence;
51
52 /* protects the pending list. */
53 spinlock_t lock;
54
55 /* set to non-zero when this thread is already awake and kicking */
56 int working;
57
58 /* are we currently idle */
59 int idle;
60};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
95static noinline int run_ordered_completions(struct btrfs_workers *workers,
96 struct btrfs_work *work)
97{
98 unsigned long flags;
99
100 if (!workers->ordered)
101 return 0;
102
103 set_bit(WORK_DONE_BIT, &work->flags);
104
105 spin_lock_irqsave(&workers->lock, flags);
106
107 while (!list_empty(&workers->order_list)) {
108 work = list_entry(workers->order_list.next,
109 struct btrfs_work, order_list);
110
111 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break;
113
114 /* we are going to call the ordered done function, but
115 * we leave the work item on the list as a barrier so
116 * that later work items that are done don't have their
117 * functions called before this one returns
118 */
119 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
120 break;
121
122 spin_unlock_irqrestore(&workers->lock, flags);
123
124 work->ordered_func(work);
125
126 /* now take the lock again and call the freeing code */
127 spin_lock_irqsave(&workers->lock, flags);
128 list_del(&work->order_list);
129 work->ordered_free(work);
130 }
131
132 spin_unlock_irqrestore(&workers->lock, flags);
133 return 0;
134}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while (!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
261/*
262 * run through the list and find a worker thread that doesn't have a lot
263 * to do right now. This can return null if we aren't yet at the thread
264 * count limit and all of the threads are busy.
265 */
266static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
267{
268 struct btrfs_worker_thread *worker;
269 struct list_head *next;
270 int enforce_min = workers->num_workers < workers->max_workers;
271
272 /*
273 * if we find an idle thread, don't move it to the end of the
274 * idle list. This improves the chance that the next submission
275 * will reuse the same thread, and maybe catch it while it is still
276 * working
277 */
278 if (!list_empty(&workers->idle_list)) {
279 next = workers->idle_list.next;
280 worker = list_entry(next, struct btrfs_worker_thread,
281 worker_list);
282 return worker;
283 }
284 if (enforce_min || list_empty(&workers->worker_list))
285 return NULL;
286
287 /*
288 * if we pick a busy task, move the task to the end of the list.
289 * hopefully this will keep things somewhat evenly balanced.
290 * Do the move in batches based on the sequence number. This groups
291 * requests submitted at roughly the same time onto the same worker.
292 */
293 next = workers->worker_list.next;
294 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
295 atomic_inc(&worker->num_pending);
296 worker->sequence++;
297
298 if (worker->sequence % workers->idle_thresh == 0)
299 list_move_tail(next, &workers->worker_list);
300 return worker;
301}
302
303/*
304 * selects a worker thread to take the next job. This will either find
305 * an idle worker, start a new worker up to the max count, or just return
306 * one of the existing busy workers.
307 */
308static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
309{
310 struct btrfs_worker_thread *worker;
311 unsigned long flags;
312
313again:
314 spin_lock_irqsave(&workers->lock, flags);
315 worker = next_worker(workers);
316 spin_unlock_irqrestore(&workers->lock, flags);
317
318 if (!worker) {
319 spin_lock_irqsave(&workers->lock, flags);
320 if (workers->num_workers >= workers->max_workers) {
321 struct list_head *fallback = NULL;
322 /*
323 * we have failed to find any workers, just
324 * return the force one
325 */
326 if (!list_empty(&workers->worker_list))
327 fallback = workers->worker_list.next;
328 if (!list_empty(&workers->idle_list))
329 fallback = workers->idle_list.next;
330 BUG_ON(!fallback);
331 worker = list_entry(fallback,
332 struct btrfs_worker_thread, worker_list);
333 spin_unlock_irqrestore(&workers->lock, flags);
334 } else {
335 spin_unlock_irqrestore(&workers->lock, flags);
336 /* we're below the limit, start another worker */
337 btrfs_start_workers(workers, 1);
338 goto again;
339 }
340 }
341 return worker;
342}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work {
39 /*
40 * func should be set to the function you want called
41 * your work struct is passed as the only arg
42 *
43 * ordered_func must be set for work sent to an ordered work queue,
44 * and it is called to complete a given work item in the same
45 * order they were sent to the queue.
46 */
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 /* max number of workers allowed. changed by btrfs_start_workers */
68 int max_workers;
69
70 /* once a worker has this many requests or fewer, it is idle */
71 int idle_thresh;
72
73 /* force completions in the order they were queued */
74 int ordered;
75
76 /* list with all the work threads. The workers on the idle thread
77 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above.
79 */
80 struct list_head worker_list;
81 struct list_head idle_list;
82
83 /*
84 * when operating in ordered mode, this maintains the list
85 * of work items waiting for completion
86 */
87 struct list_head order_list;
88
89 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock;
91
92 /* extra name for this worker, used for current->name */
93 char *name;
94};
95
96int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
97int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work);
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
26/* in memory btrfs inode */
27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
29 struct btrfs_root *root;
30
31 /* key used to find this inode on disk. This is used by the code
32 * to read in roots of subvolumes
33 */
34 struct btrfs_key location;
35
36 /* the extent_tree has caches of all the extent mappings to disk */
37 struct extent_map_tree extent_tree;
38
39 /* the io_tree does range state (DIRTY, LOCKED etc) */
40 struct extent_io_tree io_tree;
41
42 /* special utility tree used to record which mirrors have already been
43 * tried when checksums fail for a given block
44 */
45 struct extent_io_tree io_failure_tree;
46
47 /* held while inesrting or deleting extents from files */
48 struct mutex extent_mutex;
49
50 /* held while logging the inode in tree-log.c */
51 struct mutex log_mutex;
52
53 /* used to order data wrt metadata */
54 struct btrfs_ordered_inode_tree ordered_tree;
55
56 /* standard acl pointers */
57 struct posix_acl *i_acl;
58 struct posix_acl *i_default_acl;
59
60 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan;
62
63 /* list of all the delalloc inodes in the FS. There are times we need
64 * to write all the delalloc pages to disk, and this list is used
65 * to walk them all.
66 */
67 struct list_head delalloc_inodes;
68
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this.
71 */
72 u64 generation;
73
74 /* sequence number for NFS changes */
75 u64 sequence;
76
77 /*
78 * transid of the trans_handle that last modified this inode
79 */
80 u64 last_trans;
81 /*
82 * transid that last logged this inode
83 */
84 u64 logged_trans;
85
86 /*
87 * trans that last made a change that should be fully fsync'd. This
88 * gets reset to zero each time the inode is logged
89 */
90 u64 log_dirty_trans;
91
92 /* total number of bytes pending delalloc, used by stat to calc the
93 * real block usage of the file
94 */
95 u64 delalloc_bytes;
96
97 /*
98 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk
100 * because not all the blocks are written yet.
101 */
102 u64 disk_i_size;
103
104 /* flags field from the on disk inode */
105 u32 flags;
106
107 /*
108 * if this is a directory then index_cnt is the counter for the index
109 * number for new files that are created
110 */
111 u64 index_cnt;
112
113 /* the start of block group preferred for allocations. */
114 u64 block_group;
115
116 struct inode vfs_inode;
117};
118
119static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
120{
121 return container_of(inode, struct btrfs_inode, vfs_inode);
122}
123
124static inline void btrfs_i_size_write(struct inode *inode, u64 size)
125{
126 inode->i_size = size;
127 BTRFS_I(inode)->disk_i_size = size;
128}
129
130
131#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..7c4503ef6efd
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
4#define btrfs_drop_nlink(inode) drop_nlink(inode)
5#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "compat.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "volumes.h"
43#include "ordered-data.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
48struct compressed_bio {
49 /* number of bios pending for this compressed extent */
50 atomic_t pending_bios;
51
52 /* the pages with the compressed data on them */
53 struct page **compressed_pages;
54
55 /* inode that owns this data */
56 struct inode *inode;
57
58 /* starting offset in the inode for our pages */
59 u64 start;
60
61 /* number of bytes in the inode we're working on */
62 unsigned long len;
63
64 /* number of bytes on disk */
65 unsigned long compressed_len;
66
67 /* number of compressed pages in the array */
68 unsigned long nr_pages;
69
70 /* IO errors */
71 int errors;
72 int mirror_num;
73
74 /* for reads, this is the bio we are copying the data into */
75 struct bio *orig_bio;
76
77 /*
78 * the start of a variable length array of checksums only
79 * used by reads
80 */
81 u32 sums;
82};
83
84static inline int compressed_bio_size(struct btrfs_root *root,
85 unsigned long disk_size)
86{
87 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
88 return sizeof(struct compressed_bio) +
89 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
90 csum_size;
91}
92
93static struct bio *compressed_bio_alloc(struct block_device *bdev,
94 u64 first_byte, gfp_t gfp_flags)
95{
96 struct bio *bio;
97 int nr_vecs;
98
99 nr_vecs = bio_get_nr_vecs(bdev);
100 bio = bio_alloc(gfp_flags, nr_vecs);
101
102 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
103 while (!bio && (nr_vecs /= 2))
104 bio = bio_alloc(gfp_flags, nr_vecs);
105 }
106
107 if (bio) {
108 bio->bi_size = 0;
109 bio->bi_bdev = bdev;
110 bio->bi_sector = first_byte >> 9;
111 }
112 return bio;
113}
114
115static int check_compressed_csum(struct inode *inode,
116 struct compressed_bio *cb,
117 u64 disk_start)
118{
119 int ret;
120 struct btrfs_root *root = BTRFS_I(inode)->root;
121 struct page *page;
122 unsigned long i;
123 char *kaddr;
124 u32 csum;
125 u32 *cb_sum = &cb->sums;
126
127 if (btrfs_test_flag(inode, NODATASUM))
128 return 0;
129
130 for (i = 0; i < cb->nr_pages; i++) {
131 page = cb->compressed_pages[i];
132 csum = ~(u32)0;
133
134 kaddr = kmap_atomic(page, KM_USER0);
135 csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
136 btrfs_csum_final(csum, (char *)&csum);
137 kunmap_atomic(kaddr, KM_USER0);
138
139 if (csum != *cb_sum) {
140 printk(KERN_INFO "btrfs csum failed ino %lu "
141 "extent %llu csum %u "
142 "wanted %u mirror %d\n", inode->i_ino,
143 (unsigned long long)disk_start,
144 csum, *cb_sum, cb->mirror_num);
145 ret = -EIO;
146 goto fail;
147 }
148 cb_sum++;
149
150 }
151 ret = 0;
152fail:
153 return ret;
154}
155
156/* when we finish reading compressed pages from the disk, we
157 * decompress them and then run the bio end_io routines on the
158 * decompressed pages (in the inode address space).
159 *
160 * This allows the checksumming and other IO error handling routines
161 * to work normally
162 *
163 * The compressed pages are freed here, and it must be run
164 * in process context
165 */
166static void end_compressed_bio_read(struct bio *bio, int err)
167{
168 struct extent_io_tree *tree;
169 struct compressed_bio *cb = bio->bi_private;
170 struct inode *inode;
171 struct page *page;
172 unsigned long index;
173 int ret;
174
175 if (err)
176 cb->errors = 1;
177
178 /* if there are more bios still pending for this compressed
179 * extent, just exit
180 */
181 if (!atomic_dec_and_test(&cb->pending_bios))
182 goto out;
183
184 inode = cb->inode;
185 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
186 if (ret)
187 goto csum_failed;
188
189 /* ok, we're the last bio for this extent, lets start
190 * the decompression.
191 */
192 tree = &BTRFS_I(inode)->io_tree;
193 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
194 cb->start,
195 cb->orig_bio->bi_io_vec,
196 cb->orig_bio->bi_vcnt,
197 cb->compressed_len);
198csum_failed:
199 if (ret)
200 cb->errors = 1;
201
202 /* release the compressed pages */
203 index = 0;
204 for (index = 0; index < cb->nr_pages; index++) {
205 page = cb->compressed_pages[index];
206 page->mapping = NULL;
207 page_cache_release(page);
208 }
209
210 /* do io completion on the original bio */
211 if (cb->errors) {
212 bio_io_error(cb->orig_bio);
213 } else {
214 int bio_index = 0;
215 struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
216
217 /*
218 * we have verified the checksum already, set page
219 * checked so the end_io handlers know about it
220 */
221 while (bio_index < cb->orig_bio->bi_vcnt) {
222 SetPageChecked(bvec->bv_page);
223 bvec++;
224 bio_index++;
225 }
226 bio_endio(cb->orig_bio, 0);
227 }
228
229 /* finally free the cb struct */
230 kfree(cb->compressed_pages);
231 kfree(cb);
232out:
233 bio_put(bio);
234}
235
236/*
237 * Clear the writeback bits on all of the file
238 * pages for a compressed write
239 */
240static noinline int end_compressed_writeback(struct inode *inode, u64 start,
241 unsigned long ram_size)
242{
243 unsigned long index = start >> PAGE_CACHE_SHIFT;
244 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
245 struct page *pages[16];
246 unsigned long nr_pages = end_index - index + 1;
247 int i;
248 int ret;
249
250 while (nr_pages > 0) {
251 ret = find_get_pages_contig(inode->i_mapping, index,
252 min_t(unsigned long,
253 nr_pages, ARRAY_SIZE(pages)), pages);
254 if (ret == 0) {
255 nr_pages -= 1;
256 index += 1;
257 continue;
258 }
259 for (i = 0; i < ret; i++) {
260 end_page_writeback(pages[i]);
261 page_cache_release(pages[i]);
262 }
263 nr_pages -= ret;
264 index += ret;
265 }
266 /* the inode may be gone now */
267 return 0;
268}
269
270/*
271 * do the cleanup once all the compressed pages hit the disk.
272 * This will clear writeback on the file pages and free the compressed
273 * pages.
274 *
275 * This also calls the writeback end hooks for the file pages so that
276 * metadata and checksums can be updated in the file.
277 */
278static void end_compressed_bio_write(struct bio *bio, int err)
279{
280 struct extent_io_tree *tree;
281 struct compressed_bio *cb = bio->bi_private;
282 struct inode *inode;
283 struct page *page;
284 unsigned long index;
285
286 if (err)
287 cb->errors = 1;
288
289 /* if there are more bios still pending for this compressed
290 * extent, just exit
291 */
292 if (!atomic_dec_and_test(&cb->pending_bios))
293 goto out;
294
295 /* ok, we're the last bio for this extent, step one is to
296 * call back into the FS and do all the end_io operations
297 */
298 inode = cb->inode;
299 tree = &BTRFS_I(inode)->io_tree;
300 cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
301 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
302 cb->start,
303 cb->start + cb->len - 1,
304 NULL, 1);
305 cb->compressed_pages[0]->mapping = NULL;
306
307 end_compressed_writeback(inode, cb->start, cb->len);
308 /* note, our inode could be gone now */
309
310 /*
311 * release the compressed pages, these came from alloc_page and
312 * are not attached to the inode at all
313 */
314 index = 0;
315 for (index = 0; index < cb->nr_pages; index++) {
316 page = cb->compressed_pages[index];
317 page->mapping = NULL;
318 page_cache_release(page);
319 }
320
321 /* finally free the cb struct */
322 kfree(cb->compressed_pages);
323 kfree(cb);
324out:
325 bio_put(bio);
326}
327
328/*
329 * worker function to build and submit bios for previously compressed pages.
330 * The corresponding pages in the inode should be marked for writeback
331 * and the compressed pages should have a reference on them for dropping
332 * when the IO is complete.
333 *
334 * This also checksums the file bytes and gets things ready for
335 * the end io hooks.
336 */
337int btrfs_submit_compressed_write(struct inode *inode, u64 start,
338 unsigned long len, u64 disk_start,
339 unsigned long compressed_len,
340 struct page **compressed_pages,
341 unsigned long nr_pages)
342{
343 struct bio *bio = NULL;
344 struct btrfs_root *root = BTRFS_I(inode)->root;
345 struct compressed_bio *cb;
346 unsigned long bytes_left;
347 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
348 int page_index = 0;
349 struct page *page;
350 u64 first_byte = disk_start;
351 struct block_device *bdev;
352 int ret;
353
354 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
355 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
356 atomic_set(&cb->pending_bios, 0);
357 cb->errors = 0;
358 cb->inode = inode;
359 cb->start = start;
360 cb->len = len;
361 cb->mirror_num = 0;
362 cb->compressed_pages = compressed_pages;
363 cb->compressed_len = compressed_len;
364 cb->orig_bio = NULL;
365 cb->nr_pages = nr_pages;
366
367 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
368
369 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
370 bio->bi_private = cb;
371 bio->bi_end_io = end_compressed_bio_write;
372 atomic_inc(&cb->pending_bios);
373
374 /* create and submit bios for the compressed pages */
375 bytes_left = compressed_len;
376 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
377 page = compressed_pages[page_index];
378 page->mapping = inode->i_mapping;
379 if (bio->bi_size)
380 ret = io_tree->ops->merge_bio_hook(page, 0,
381 PAGE_CACHE_SIZE,
382 bio, 0);
383 else
384 ret = 0;
385
386 page->mapping = NULL;
387 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
388 PAGE_CACHE_SIZE) {
389 bio_get(bio);
390
391 /*
392 * inc the count before we submit the bio so
393 * we know the end IO handler won't happen before
394 * we inc the count. Otherwise, the cb might get
395 * freed before we're done setting it up
396 */
397 atomic_inc(&cb->pending_bios);
398 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
399 BUG_ON(ret);
400
401 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
402 BUG_ON(ret);
403
404 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
405 BUG_ON(ret);
406
407 bio_put(bio);
408
409 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
410 bio->bi_private = cb;
411 bio->bi_end_io = end_compressed_bio_write;
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 }
414 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n",
416 bytes_left, cb->compressed_len, cb->nr_pages);
417 }
418 bytes_left -= PAGE_CACHE_SIZE;
419 first_byte += PAGE_CACHE_SIZE;
420 cond_resched();
421 }
422 bio_get(bio);
423
424 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
425 BUG_ON(ret);
426
427 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
431 BUG_ON(ret);
432
433 bio_put(bio);
434 return 0;
435}
436
437static noinline int add_ra_bio_pages(struct inode *inode,
438 u64 compressed_end,
439 struct compressed_bio *cb)
440{
441 unsigned long end_index;
442 unsigned long page_index;
443 u64 last_offset;
444 u64 isize = i_size_read(inode);
445 int ret;
446 struct page *page;
447 unsigned long nr_pages = 0;
448 struct extent_map *em;
449 struct address_space *mapping = inode->i_mapping;
450 struct pagevec pvec;
451 struct extent_map_tree *em_tree;
452 struct extent_io_tree *tree;
453 u64 end;
454 int misses = 0;
455
456 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
457 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
458 em_tree = &BTRFS_I(inode)->extent_tree;
459 tree = &BTRFS_I(inode)->io_tree;
460
461 if (isize == 0)
462 return 0;
463
464 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
465
466 pagevec_init(&pvec, 0);
467 while (last_offset < compressed_end) {
468 page_index = last_offset >> PAGE_CACHE_SHIFT;
469
470 if (page_index > end_index)
471 break;
472
473 rcu_read_lock();
474 page = radix_tree_lookup(&mapping->page_tree, page_index);
475 rcu_read_unlock();
476 if (page) {
477 misses++;
478 if (misses > 4)
479 break;
480 goto next;
481 }
482
483 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
484 if (!page)
485 break;
486
487 page->index = page_index;
488 /*
489 * what we want to do here is call add_to_page_cache_lru,
490 * but that isn't exported, so we reproduce it here
491 */
492 if (add_to_page_cache(page, mapping,
493 page->index, GFP_NOFS)) {
494 page_cache_release(page);
495 goto next;
496 }
497
498 /* open coding of lru_cache_add, also not exported */
499 page_cache_get(page);
500 if (!pagevec_add(&pvec, page))
501 __pagevec_lru_add_file(&pvec);
502
503 end = last_offset + PAGE_CACHE_SIZE - 1;
504 /*
505 * at this point, we have a locked page in the page cache
506 * for these bytes in the file. But, we have to make
507 * sure they map to this compressed extent on disk.
508 */
509 set_page_extent_mapped(page);
510 lock_extent(tree, last_offset, end, GFP_NOFS);
511 spin_lock(&em_tree->lock);
512 em = lookup_extent_mapping(em_tree, last_offset,
513 PAGE_CACHE_SIZE);
514 spin_unlock(&em_tree->lock);
515
516 if (!em || last_offset < em->start ||
517 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
518 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
519 free_extent_map(em);
520 unlock_extent(tree, last_offset, end, GFP_NOFS);
521 unlock_page(page);
522 page_cache_release(page);
523 break;
524 }
525 free_extent_map(em);
526
527 if (page->index == end_index) {
528 char *userpage;
529 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
530
531 if (zero_offset) {
532 int zeros;
533 zeros = PAGE_CACHE_SIZE - zero_offset;
534 userpage = kmap_atomic(page, KM_USER0);
535 memset(userpage + zero_offset, 0, zeros);
536 flush_dcache_page(page);
537 kunmap_atomic(userpage, KM_USER0);
538 }
539 }
540
541 ret = bio_add_page(cb->orig_bio, page,
542 PAGE_CACHE_SIZE, 0);
543
544 if (ret == PAGE_CACHE_SIZE) {
545 nr_pages++;
546 page_cache_release(page);
547 } else {
548 unlock_extent(tree, last_offset, end, GFP_NOFS);
549 unlock_page(page);
550 page_cache_release(page);
551 break;
552 }
553next:
554 last_offset += PAGE_CACHE_SIZE;
555 }
556 if (pagevec_count(&pvec))
557 __pagevec_lru_add_file(&pvec);
558 return 0;
559}
560
561/*
562 * for a compressed read, the bio we get passed has all the inode pages
563 * in it. We don't actually do IO on those pages but allocate new ones
564 * to hold the compressed pages on disk.
565 *
566 * bio->bi_sector points to the compressed extent on disk
567 * bio->bi_io_vec points to all of the inode pages
568 * bio->bi_vcnt is a count of pages
569 *
570 * After the compressed pages are read, we copy the bytes into the
571 * bio we were passed and then call the bio end_io calls
572 */
573int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
574 int mirror_num, unsigned long bio_flags)
575{
576 struct extent_io_tree *tree;
577 struct extent_map_tree *em_tree;
578 struct compressed_bio *cb;
579 struct btrfs_root *root = BTRFS_I(inode)->root;
580 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
581 unsigned long compressed_len;
582 unsigned long nr_pages;
583 unsigned long page_index;
584 struct page *page;
585 struct block_device *bdev;
586 struct bio *comp_bio;
587 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
588 u64 em_len;
589 u64 em_start;
590 struct extent_map *em;
591 int ret;
592 u32 *sums;
593
594 tree = &BTRFS_I(inode)->io_tree;
595 em_tree = &BTRFS_I(inode)->extent_tree;
596
597 /* we need the actual starting offset of this extent in the file */
598 spin_lock(&em_tree->lock);
599 em = lookup_extent_mapping(em_tree,
600 page_offset(bio->bi_io_vec->bv_page),
601 PAGE_CACHE_SIZE);
602 spin_unlock(&em_tree->lock);
603
604 compressed_len = em->block_len;
605 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
606 atomic_set(&cb->pending_bios, 0);
607 cb->errors = 0;
608 cb->inode = inode;
609 cb->mirror_num = mirror_num;
610 sums = &cb->sums;
611
612 cb->start = em->orig_start;
613 em_len = em->len;
614 em_start = em->start;
615
616 free_extent_map(em);
617 em = NULL;
618
619 cb->len = uncompressed_len;
620 cb->compressed_len = compressed_len;
621 cb->orig_bio = bio;
622
623 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
624 PAGE_CACHE_SIZE;
625 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
626 GFP_NOFS);
627 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
628
629 for (page_index = 0; page_index < nr_pages; page_index++) {
630 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
631 __GFP_HIGHMEM);
632 }
633 cb->nr_pages = nr_pages;
634
635 add_ra_bio_pages(inode, em_start + em_len, cb);
636
637 /* include any pages we added in add_ra-bio_pages */
638 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
639 cb->len = uncompressed_len;
640
641 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
642 comp_bio->bi_private = cb;
643 comp_bio->bi_end_io = end_compressed_bio_read;
644 atomic_inc(&cb->pending_bios);
645
646 for (page_index = 0; page_index < nr_pages; page_index++) {
647 page = cb->compressed_pages[page_index];
648 page->mapping = inode->i_mapping;
649 page->index = em_start >> PAGE_CACHE_SHIFT;
650
651 if (comp_bio->bi_size)
652 ret = tree->ops->merge_bio_hook(page, 0,
653 PAGE_CACHE_SIZE,
654 comp_bio, 0);
655 else
656 ret = 0;
657
658 page->mapping = NULL;
659 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
660 PAGE_CACHE_SIZE) {
661 bio_get(comp_bio);
662
663 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
664 BUG_ON(ret);
665
666 /*
667 * inc the count before we submit the bio so
668 * we know the end IO handler won't happen before
669 * we inc the count. Otherwise, the cb might get
670 * freed before we're done setting it up
671 */
672 atomic_inc(&cb->pending_bios);
673
674 if (!btrfs_test_flag(inode, NODATASUM)) {
675 btrfs_lookup_bio_sums(root, inode, comp_bio,
676 sums);
677 }
678 sums += (comp_bio->bi_size + root->sectorsize - 1) /
679 root->sectorsize;
680
681 ret = btrfs_map_bio(root, READ, comp_bio,
682 mirror_num, 0);
683 BUG_ON(ret);
684
685 bio_put(comp_bio);
686
687 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
688 GFP_NOFS);
689 comp_bio->bi_private = cb;
690 comp_bio->bi_end_io = end_compressed_bio_read;
691
692 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
693 }
694 cur_disk_byte += PAGE_CACHE_SIZE;
695 }
696 bio_get(comp_bio);
697
698 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
699 BUG_ON(ret);
700
701 if (!btrfs_test_flag(inode, NODATASUM))
702 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
703
704 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
705 BUG_ON(ret);
706
707 bio_put(comp_bio);
708 return 0;
709}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..6e1b3de36700
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
28#endif
29
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that no locks or extent buffers held.
69 */
70noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cowonly root (everything not a reference counted cow subvolume), just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 write_extent_buffer(cow, root->fs_info->fsid,
189 (unsigned long)btrfs_header_fsid(cow),
190 BTRFS_FSID_SIZE);
191
192 WARN_ON(btrfs_header_generation(buf) > trans->transid);
193 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 kfree(new_root);
195
196 if (ret)
197 return ret;
198
199 btrfs_mark_buffer_dirty(cow);
200 *cow_ret = cow;
201 return 0;
202}
203
204/*
205 * does the dirty work in cow of a single block. The parent block (if
206 * supplied) is updated to point to the new cow copy. The new buffer is marked
207 * dirty and returned locked. If you modify the block it needs to be marked
208 * dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in
213 * bytes the allocator should try to find free next to the block it returns.
214 * This is just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one.
218 * btrfs_alloc_reserved_extent is used to finish the allocation.
219 */
220static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %llu running %llu\n",
379 (unsigned long long)trans->transid,
380 (unsigned long long)
381 root->fs_info->running_transaction->transid);
382 WARN_ON(1);
383 }
384 if (trans->transid != root->fs_info->generation) {
385 printk(KERN_CRIT "trans %llu running %llu\n",
386 (unsigned long long)trans->transid,
387 (unsigned long long)root->fs_info->generation);
388 WARN_ON(1);
389 }
390
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest);
398 return 0;
399 }
400 spin_unlock(&root->fs_info->hash_lock);
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest);
405 return ret;
406}
407
408/*
409 * helper function for defrag to decide if two blocks pointed to by a
410 * node are actually close by
411 */
412static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413{
414 if (blocknr < other && other - (blocknr + blocksize) < 32768)
415 return 1;
416 if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 return 1;
418 return 0;
419}
420
421/*
422 * compare two keys in a memcmp fashion
423 */
424static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
425{
426 struct btrfs_key k1;
427
428 btrfs_disk_key_to_cpu(&k1, disk);
429
430 if (k1.objectid > k2->objectid)
431 return 1;
432 if (k1.objectid < k2->objectid)
433 return -1;
434 if (k1.type > k2->type)
435 return 1;
436 if (k1.type < k2->type)
437 return -1;
438 if (k1.offset > k2->offset)
439 return 1;
440 if (k1.offset < k2->offset)
441 return -1;
442 return 0;
443}
444
445/*
446 * same as comp_keys only with two btrfs_key's
447 */
448static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
449{
450 if (k1->objectid > k2->objectid)
451 return 1;
452 if (k1->objectid < k2->objectid)
453 return -1;
454 if (k1->type > k2->type)
455 return 1;
456 if (k1->type < k2->type)
457 return -1;
458 if (k1->offset > k2->offset)
459 return 1;
460 if (k1->offset < k2->offset)
461 return -1;
462 return 0;
463}
464
465/*
466 * this is used by the defrag code to go through all the
467 * leaves pointed to by a node and reallocate them so that
468 * disk order is close to key order
469 */
470int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471 struct btrfs_root *root, struct extent_buffer *parent,
472 int start_slot, int cache_only, u64 *last_ret,
473 struct btrfs_key *progress)
474{
475 struct extent_buffer *cur;
476 u64 blocknr;
477 u64 gen;
478 u64 search_start = *last_ret;
479 u64 last_block = 0;
480 u64 other;
481 u32 parent_nritems;
482 int end_slot;
483 int i;
484 int err = 0;
485 int parent_level;
486 int uptodate;
487 u32 blocksize;
488 int progress_passed = 0;
489 struct btrfs_disk_key disk_key;
490
491 parent_level = btrfs_header_level(parent);
492 if (cache_only && parent_level != 1)
493 return 0;
494
495 if (trans->transaction != root->fs_info->running_transaction)
496 WARN_ON(1);
497 if (trans->transid != root->fs_info->generation)
498 WARN_ON(1);
499
500 parent_nritems = btrfs_header_nritems(parent);
501 blocksize = btrfs_level_size(root, parent_level - 1);
502 end_slot = parent_nritems;
503
504 if (parent_nritems == 1)
505 return 0;
506
507 for (i = start_slot; i < end_slot; i++) {
508 int close = 1;
509
510 if (!parent->map_token) {
511 map_extent_buffer(parent,
512 btrfs_node_key_ptr_offset(i),
513 sizeof(struct btrfs_key_ptr),
514 &parent->map_token, &parent->kaddr,
515 &parent->map_start, &parent->map_len,
516 KM_USER1);
517 }
518 btrfs_node_key(parent, &disk_key, i);
519 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
520 continue;
521
522 progress_passed = 1;
523 blocknr = btrfs_node_blockptr(parent, i);
524 gen = btrfs_node_ptr_generation(parent, i);
525 if (last_block == 0)
526 last_block = blocknr;
527
528 if (i > 0) {
529 other = btrfs_node_blockptr(parent, i - 1);
530 close = close_blocks(blocknr, other, blocksize);
531 }
532 if (!close && i < end_slot - 2) {
533 other = btrfs_node_blockptr(parent, i + 1);
534 close = close_blocks(blocknr, other, blocksize);
535 }
536 if (close) {
537 last_block = blocknr;
538 continue;
539 }
540 if (parent->map_token) {
541 unmap_extent_buffer(parent, parent->map_token,
542 KM_USER1);
543 parent->map_token = NULL;
544 }
545
546 cur = btrfs_find_tree_block(root, blocknr, blocksize);
547 if (cur)
548 uptodate = btrfs_buffer_uptodate(cur, gen);
549 else
550 uptodate = 0;
551 if (!cur || !uptodate) {
552 if (cache_only) {
553 free_extent_buffer(cur);
554 continue;
555 }
556 if (!cur) {
557 cur = read_tree_block(root, blocknr,
558 blocksize, gen);
559 } else if (!uptodate) {
560 btrfs_read_buffer(cur, gen);
561 }
562 }
563 if (search_start == 0)
564 search_start = last_block;
565
566 btrfs_tree_lock(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start,
569 min(16 * blocksize,
570 (end_slot - i) * blocksize), 0);
571 if (err) {
572 btrfs_tree_unlock(cur);
573 free_extent_buffer(cur);
574 break;
575 }
576 search_start = cur->start;
577 last_block = cur->start;
578 *last_ret = search_start;
579 btrfs_tree_unlock(cur);
580 free_extent_buffer(cur);
581 }
582 if (parent->map_token) {
583 unmap_extent_buffer(parent, parent->map_token,
584 KM_USER1);
585 parent->map_token = NULL;
586 }
587 return err;
588}
589
590/*
591 * The leaf data grows from end-to-front in the node.
592 * this returns the address of the start of the last item,
593 * which is the stop of the leaf data stack
594 */
595static inline unsigned int leaf_data_end(struct btrfs_root *root,
596 struct extent_buffer *leaf)
597{
598 u32 nr = btrfs_header_nritems(leaf);
599 if (nr == 0)
600 return BTRFS_LEAF_DATA_SIZE(root);
601 return btrfs_item_offset_nr(leaf, nr - 1);
602}
603
604/*
605 * extra debugging checks to make sure all the items in a key are
606 * well formed and in the proper order
607 */
608static int check_node(struct btrfs_root *root, struct btrfs_path *path,
609 int level)
610{
611 struct extent_buffer *parent = NULL;
612 struct extent_buffer *node = path->nodes[level];
613 struct btrfs_disk_key parent_key;
614 struct btrfs_disk_key node_key;
615 int parent_slot;
616 int slot;
617 struct btrfs_key cpukey;
618 u32 nritems = btrfs_header_nritems(node);
619
620 if (path->nodes[level + 1])
621 parent = path->nodes[level + 1];
622
623 slot = path->slots[level];
624 BUG_ON(nritems == 0);
625 if (parent) {
626 parent_slot = path->slots[level + 1];
627 btrfs_node_key(parent, &parent_key, parent_slot);
628 btrfs_node_key(node, &node_key, 0);
629 BUG_ON(memcmp(&parent_key, &node_key,
630 sizeof(struct btrfs_disk_key)));
631 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632 btrfs_header_bytenr(node));
633 }
634 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635 if (slot != 0) {
636 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
637 btrfs_node_key(node, &node_key, slot);
638 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 }
640 if (slot < nritems - 1) {
641 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
642 btrfs_node_key(node, &node_key, slot);
643 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
644 }
645 return 0;
646}
647
648/*
649 * extra checking to make sure all the items in a leaf are
650 * well formed and in the proper order
651 */
652static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
653 int level)
654{
655 struct extent_buffer *leaf = path->nodes[level];
656 struct extent_buffer *parent = NULL;
657 int parent_slot;
658 struct btrfs_key cpukey;
659 struct btrfs_disk_key parent_key;
660 struct btrfs_disk_key leaf_key;
661 int slot = path->slots[0];
662
663 u32 nritems = btrfs_header_nritems(leaf);
664
665 if (path->nodes[level + 1])
666 parent = path->nodes[level + 1];
667
668 if (nritems == 0)
669 return 0;
670
671 if (parent) {
672 parent_slot = path->slots[level + 1];
673 btrfs_node_key(parent, &parent_key, parent_slot);
674 btrfs_item_key(leaf, &leaf_key, 0);
675
676 BUG_ON(memcmp(&parent_key, &leaf_key,
677 sizeof(struct btrfs_disk_key)));
678 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679 btrfs_header_bytenr(leaf));
680 }
681 if (slot != 0 && slot < nritems - 1) {
682 btrfs_item_key(leaf, &leaf_key, slot);
683 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
684 if (comp_keys(&leaf_key, &cpukey) <= 0) {
685 btrfs_print_leaf(root, leaf);
686 printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 BUG_ON(1);
688 }
689 if (btrfs_item_offset_nr(leaf, slot - 1) !=
690 btrfs_item_end_nr(leaf, slot)) {
691 btrfs_print_leaf(root, leaf);
692 printk(KERN_CRIT "slot %d offset bad\n", slot);
693 BUG_ON(1);
694 }
695 }
696 if (slot < nritems - 1) {
697 btrfs_item_key(leaf, &leaf_key, slot);
698 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
699 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
700 if (btrfs_item_offset_nr(leaf, slot) !=
701 btrfs_item_end_nr(leaf, slot + 1)) {
702 btrfs_print_leaf(root, leaf);
703 printk(KERN_CRIT "slot %d offset bad\n", slot);
704 BUG_ON(1);
705 }
706 }
707 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
708 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
709 return 0;
710}
711
712static noinline int check_block(struct btrfs_root *root,
713 struct btrfs_path *path, int level)
714{
715 return 0;
716 if (level == 0)
717 return check_leaf(root, path, level);
718 return check_node(root, path, level);
719}
720
721/*
722 * search for key in the extent_buffer. The items start at offset p,
723 * and they are item_size apart. There are 'max' items in p.
724 *
725 * the slot in the array is returned via slot, and it points to
726 * the place where you would insert key if it is not found in
727 * the array.
728 *
729 * slot may point to max if the key is bigger than all of the keys
730 */
731static noinline int generic_bin_search(struct extent_buffer *eb,
732 unsigned long p,
733 int item_size, struct btrfs_key *key,
734 int max, int *slot)
735{
736 int low = 0;
737 int high = max;
738 int mid;
739 int ret;
740 struct btrfs_disk_key *tmp = NULL;
741 struct btrfs_disk_key unaligned;
742 unsigned long offset;
743 char *map_token = NULL;
744 char *kaddr = NULL;
745 unsigned long map_start = 0;
746 unsigned long map_len = 0;
747 int err;
748
749 while (low < high) {
750 mid = (low + high) / 2;
751 offset = p + mid * item_size;
752
753 if (!map_token || offset < map_start ||
754 (offset + sizeof(struct btrfs_disk_key)) >
755 map_start + map_len) {
756 if (map_token) {
757 unmap_extent_buffer(eb, map_token, KM_USER0);
758 map_token = NULL;
759 }
760
761 err = map_private_extent_buffer(eb, offset,
762 sizeof(struct btrfs_disk_key),
763 &map_token, &kaddr,
764 &map_start, &map_len, KM_USER0);
765
766 if (!err) {
767 tmp = (struct btrfs_disk_key *)(kaddr + offset -
768 map_start);
769 } else {
770 read_extent_buffer(eb, &unaligned,
771 offset, sizeof(unaligned));
772 tmp = &unaligned;
773 }
774
775 } else {
776 tmp = (struct btrfs_disk_key *)(kaddr + offset -
777 map_start);
778 }
779 ret = comp_keys(tmp, key);
780
781 if (ret < 0)
782 low = mid + 1;
783 else if (ret > 0)
784 high = mid;
785 else {
786 *slot = mid;
787 if (map_token)
788 unmap_extent_buffer(eb, map_token, KM_USER0);
789 return 0;
790 }
791 }
792 *slot = low;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 1;
796}
797
798/*
799 * simple bin_search frontend that does the right thing for
800 * leaves vs nodes
801 */
802static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
803 int level, int *slot)
804{
805 if (level == 0) {
806 return generic_bin_search(eb,
807 offsetof(struct btrfs_leaf, items),
808 sizeof(struct btrfs_item),
809 key, btrfs_header_nritems(eb),
810 slot);
811 } else {
812 return generic_bin_search(eb,
813 offsetof(struct btrfs_node, ptrs),
814 sizeof(struct btrfs_key_ptr),
815 key, btrfs_header_nritems(eb),
816 slot);
817 }
818 return -1;
819}
820
821/* given a node and slot number, this reads the blocks it points to. The
822 * extent buffer is returned with a reference taken (but unlocked).
823 * NULL is returned on error.
824 */
825static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826 struct extent_buffer *parent, int slot)
827{
828 int level = btrfs_header_level(parent);
829 if (slot < 0)
830 return NULL;
831 if (slot >= btrfs_header_nritems(parent))
832 return NULL;
833
834 BUG_ON(level == 0);
835
836 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 btrfs_level_size(root, level - 1),
838 btrfs_node_ptr_generation(parent, slot));
839}
840
841/*
842 * node level balancing, used to make sure nodes are in proper order for
843 * item deletion. We balance from the top down, so we have to make sure
844 * that a deletion won't leave an node completely empty later on.
845 */
846static noinline int balance_level(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_path *path, int level)
849{
850 struct extent_buffer *right = NULL;
851 struct extent_buffer *mid;
852 struct extent_buffer *left = NULL;
853 struct extent_buffer *parent = NULL;
854 int ret = 0;
855 int wret;
856 int pslot;
857 int orig_slot = path->slots[level];
858 int err_on_enospc = 0;
859 u64 orig_ptr;
860
861 if (level == 0)
862 return 0;
863
864 mid = path->nodes[level];
865 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869
870 if (level < BTRFS_MAX_LEVEL - 1)
871 parent = path->nodes[level + 1];
872 pslot = path->slots[level + 1];
873
874 /*
875 * deal with the case where there is only one pointer in the root
876 * by promoting the node below to a root
877 */
878 if (!parent) {
879 struct extent_buffer *child;
880
881 if (btrfs_header_nritems(mid) != 1)
882 return 0;
883
884 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret);
890
891 spin_lock(&root->node_lock);
892 root->node = child;
893 spin_unlock(&root->node_lock);
894
895 ret = btrfs_update_extent_ref(trans, root, child->start,
896 mid->start, child->start,
897 root->root_key.objectid,
898 trans->transid, level - 1);
899 BUG_ON(ret);
900
901 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child);
903 path->locks[level] = 0;
904 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid);
906 btrfs_tree_unlock(mid);
907 /* once for the path */
908 free_extent_buffer(mid);
909 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
910 mid->start, root->root_key.objectid,
911 btrfs_header_generation(mid),
912 level, 1);
913 /* once for the root ptr */
914 free_extent_buffer(mid);
915 return ret;
916 }
917 if (btrfs_header_nritems(mid) >
918 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 return 0;
920
921 if (btrfs_header_nritems(mid) < 2)
922 err_on_enospc = 1;
923
924 left = read_node_slot(root, parent, pslot - 1);
925 if (left) {
926 btrfs_tree_lock(left);
927 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0);
929 if (wret) {
930 ret = wret;
931 goto enospc;
932 }
933 }
934 right = read_node_slot(root, parent, pslot + 1);
935 if (right) {
936 btrfs_tree_lock(right);
937 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0);
939 if (wret) {
940 ret = wret;
941 goto enospc;
942 }
943 }
944
945 /* first, try to make some room in the middle buffer */
946 if (left) {
947 orig_slot += btrfs_header_nritems(left);
948 wret = push_node_left(trans, root, left, mid, 1);
949 if (wret < 0)
950 ret = wret;
951 if (btrfs_header_nritems(mid) < 2)
952 err_on_enospc = 1;
953 }
954
955 /*
956 * then try to empty the right most buffer into the middle
957 */
958 if (right) {
959 wret = push_node_left(trans, root, mid, right, 1);
960 if (wret < 0 && wret != -ENOSPC)
961 ret = wret;
962 if (btrfs_header_nritems(right) == 0) {
963 u64 bytenr = right->start;
964 u64 generation = btrfs_header_generation(parent);
965 u32 blocksize = right->len;
966
967 clean_tree_block(trans, root, right);
968 btrfs_tree_unlock(right);
969 free_extent_buffer(right);
970 right = NULL;
971 wret = del_ptr(trans, root, path, level + 1, pslot +
972 1);
973 if (wret)
974 ret = wret;
975 wret = btrfs_free_extent(trans, root, bytenr,
976 blocksize, parent->start,
977 btrfs_header_owner(parent),
978 generation, level, 1);
979 if (wret)
980 ret = wret;
981 } else {
982 struct btrfs_disk_key right_key;
983 btrfs_node_key(right, &right_key, 0);
984 btrfs_set_node_key(parent, &right_key, pslot + 1);
985 btrfs_mark_buffer_dirty(parent);
986 }
987 }
988 if (btrfs_header_nritems(mid) == 1) {
989 /*
990 * we're not allowed to leave a node with one item in the
991 * tree during a delete. A deletion from lower in the tree
992 * could try to delete the only pointer in this node.
993 * So, pull some keys from the left.
994 * There has to be a left pointer at this point because
995 * otherwise we would have pulled some pointers from the
996 * right
997 */
998 BUG_ON(!left);
999 wret = balance_node_right(trans, root, mid, left);
1000 if (wret < 0) {
1001 ret = wret;
1002 goto enospc;
1003 }
1004 if (wret == 1) {
1005 wret = push_node_left(trans, root, left, mid, 1);
1006 if (wret < 0)
1007 ret = wret;
1008 }
1009 BUG_ON(wret == 1);
1010 }
1011 if (btrfs_header_nritems(mid) == 0) {
1012 /* we've managed to empty the middle node, drop it */
1013 u64 root_gen = btrfs_header_generation(parent);
1014 u64 bytenr = mid->start;
1015 u32 blocksize = mid->len;
1016
1017 clean_tree_block(trans, root, mid);
1018 btrfs_tree_unlock(mid);
1019 free_extent_buffer(mid);
1020 mid = NULL;
1021 wret = del_ptr(trans, root, path, level + 1, pslot);
1022 if (wret)
1023 ret = wret;
1024 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1025 parent->start,
1026 btrfs_header_owner(parent),
1027 root_gen, level, 1);
1028 if (wret)
1029 ret = wret;
1030 } else {
1031 /* update the parent key to reflect our changes */
1032 struct btrfs_disk_key mid_key;
1033 btrfs_node_key(mid, &mid_key, 0);
1034 btrfs_set_node_key(parent, &mid_key, pslot);
1035 btrfs_mark_buffer_dirty(parent);
1036 }
1037
1038 /* update the path */
1039 if (left) {
1040 if (btrfs_header_nritems(left) > orig_slot) {
1041 extent_buffer_get(left);
1042 /* left was locked after cow */
1043 path->nodes[level] = left;
1044 path->slots[level + 1] -= 1;
1045 path->slots[level] = orig_slot;
1046 if (mid) {
1047 btrfs_tree_unlock(mid);
1048 free_extent_buffer(mid);
1049 }
1050 } else {
1051 orig_slot -= btrfs_header_nritems(left);
1052 path->slots[level] = orig_slot;
1053 }
1054 }
1055 /* double check we haven't messed things up */
1056 check_block(root, path, level);
1057 if (orig_ptr !=
1058 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059 BUG();
1060enospc:
1061 if (right) {
1062 btrfs_tree_unlock(right);
1063 free_extent_buffer(right);
1064 }
1065 if (left) {
1066 if (path->nodes[level] != left)
1067 btrfs_tree_unlock(left);
1068 free_extent_buffer(left);
1069 }
1070 return ret;
1071}
1072
1073/* Node balancing for insertion. Here we only split or push nodes around
1074 * when they are completely full. This is also done top down, so we
1075 * have to be pessimistic.
1076 */
1077static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path, int level)
1080{
1081 struct extent_buffer *right = NULL;
1082 struct extent_buffer *mid;
1083 struct extent_buffer *left = NULL;
1084 struct extent_buffer *parent = NULL;
1085 int ret = 0;
1086 int wret;
1087 int pslot;
1088 int orig_slot = path->slots[level];
1089 u64 orig_ptr;
1090
1091 if (level == 0)
1092 return 1;
1093
1094 mid = path->nodes[level];
1095 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1097
1098 if (level < BTRFS_MAX_LEVEL - 1)
1099 parent = path->nodes[level + 1];
1100 pslot = path->slots[level + 1];
1101
1102 if (!parent)
1103 return 1;
1104
1105 left = read_node_slot(root, parent, pslot - 1);
1106
1107 /* first, try to make some room in the middle buffer */
1108 if (left) {
1109 u32 left_nr;
1110
1111 btrfs_tree_lock(left);
1112 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1;
1115 } else {
1116 ret = btrfs_cow_block(trans, root, left, parent,
1117 pslot - 1, &left, 0);
1118 if (ret)
1119 wret = 1;
1120 else {
1121 wret = push_node_left(trans, root,
1122 left, mid, 0);
1123 }
1124 }
1125 if (wret < 0)
1126 ret = wret;
1127 if (wret == 0) {
1128 struct btrfs_disk_key disk_key;
1129 orig_slot += left_nr;
1130 btrfs_node_key(mid, &disk_key, 0);
1131 btrfs_set_node_key(parent, &disk_key, pslot);
1132 btrfs_mark_buffer_dirty(parent);
1133 if (btrfs_header_nritems(left) > orig_slot) {
1134 path->nodes[level] = left;
1135 path->slots[level + 1] -= 1;
1136 path->slots[level] = orig_slot;
1137 btrfs_tree_unlock(mid);
1138 free_extent_buffer(mid);
1139 } else {
1140 orig_slot -=
1141 btrfs_header_nritems(left);
1142 path->slots[level] = orig_slot;
1143 btrfs_tree_unlock(left);
1144 free_extent_buffer(left);
1145 }
1146 return 0;
1147 }
1148 btrfs_tree_unlock(left);
1149 free_extent_buffer(left);
1150 }
1151 right = read_node_slot(root, parent, pslot + 1);
1152
1153 /*
1154 * then try to empty the right most buffer into the middle
1155 */
1156 if (right) {
1157 u32 right_nr;
1158 btrfs_tree_lock(right);
1159 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1;
1162 } else {
1163 ret = btrfs_cow_block(trans, root, right,
1164 parent, pslot + 1,
1165 &right, 0);
1166 if (ret)
1167 wret = 1;
1168 else {
1169 wret = balance_node_right(trans, root,
1170 right, mid);
1171 }
1172 }
1173 if (wret < 0)
1174 ret = wret;
1175 if (wret == 0) {
1176 struct btrfs_disk_key disk_key;
1177
1178 btrfs_node_key(right, &disk_key, 0);
1179 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1180 btrfs_mark_buffer_dirty(parent);
1181
1182 if (btrfs_header_nritems(mid) <= orig_slot) {
1183 path->nodes[level] = right;
1184 path->slots[level + 1] += 1;
1185 path->slots[level] = orig_slot -
1186 btrfs_header_nritems(mid);
1187 btrfs_tree_unlock(mid);
1188 free_extent_buffer(mid);
1189 } else {
1190 btrfs_tree_unlock(right);
1191 free_extent_buffer(right);
1192 }
1193 return 0;
1194 }
1195 btrfs_tree_unlock(right);
1196 free_extent_buffer(right);
1197 }
1198 return 1;
1199}
1200
1201/*
1202 * readahead one full node of leaves, finding things that are close
1203 * to the block in 'slot', and triggering ra on them.
1204 */
1205static noinline void reada_for_search(struct btrfs_root *root,
1206 struct btrfs_path *path,
1207 int level, int slot, u64 objectid)
1208{
1209 struct extent_buffer *node;
1210 struct btrfs_disk_key disk_key;
1211 u32 nritems;
1212 u64 search;
1213 u64 lowest_read;
1214 u64 highest_read;
1215 u64 nread = 0;
1216 int direction = path->reada;
1217 struct extent_buffer *eb;
1218 u32 nr;
1219 u32 blocksize;
1220 u32 nscan = 0;
1221
1222 if (level != 1)
1223 return;
1224
1225 if (!path->nodes[level])
1226 return;
1227
1228 node = path->nodes[level];
1229
1230 search = btrfs_node_blockptr(node, slot);
1231 blocksize = btrfs_level_size(root, level - 1);
1232 eb = btrfs_find_tree_block(root, search, blocksize);
1233 if (eb) {
1234 free_extent_buffer(eb);
1235 return;
1236 }
1237
1238 highest_read = search;
1239 lowest_read = search;
1240
1241 nritems = btrfs_header_nritems(node);
1242 nr = slot;
1243 while (1) {
1244 if (direction < 0) {
1245 if (nr == 0)
1246 break;
1247 nr--;
1248 } else if (direction > 0) {
1249 nr++;
1250 if (nr >= nritems)
1251 break;
1252 }
1253 if (path->reada < 0 && objectid) {
1254 btrfs_node_key(node, &disk_key, nr);
1255 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1256 break;
1257 }
1258 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) ||
1260 (search < lowest_read && lowest_read - search <= 16384) ||
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize;
1265 }
1266 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268 break;
1269
1270 if (nread > (256 * 1024) || nscan > 128)
1271 break;
1272
1273 if (search < lowest_read)
1274 lowest_read = search;
1275 if (search > highest_read)
1276 highest_read = search;
1277 }
1278}
1279
1280/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because
1283 * operations on the tree might require changing key pointers higher up in the
1284 * tree.
1285 *
1286 * callers might also have set path->keep_locks, which tells this code to keep
1287 * the lock if the path points to the last slot in the block. This is part of
1288 * walking through the tree, and selecting the next slot in the higher block.
1289 *
1290 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1291 * if lowest_unlock is 1, level 0 won't be unlocked
1292 */
1293static noinline void unlock_up(struct btrfs_path *path, int level,
1294 int lowest_unlock)
1295{
1296 int i;
1297 int skip_level = level;
1298 int no_skips = 0;
1299 struct extent_buffer *t;
1300
1301 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1302 if (!path->nodes[i])
1303 break;
1304 if (!path->locks[i])
1305 break;
1306 if (!no_skips && path->slots[i] == 0) {
1307 skip_level = i + 1;
1308 continue;
1309 }
1310 if (!no_skips && path->keep_locks) {
1311 u32 nritems;
1312 t = path->nodes[i];
1313 nritems = btrfs_header_nritems(t);
1314 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 skip_level = i + 1;
1316 continue;
1317 }
1318 }
1319 if (skip_level < i && i >= lowest_unlock)
1320 no_skips = 1;
1321
1322 t = path->nodes[i];
1323 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1324 btrfs_tree_unlock(t);
1325 path->locks[i] = 0;
1326 }
1327 }
1328}
1329
1330/*
1331 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0)
1334 *
1335 * If the key isn't found, the path points to the slot where it should
1336 * be inserted, and 1 is returned. If there are other errors during the
1337 * search a negative error number is returned.
1338 *
1339 * if ins_len > 0, nodes and leaves will be split as we walk down the
1340 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1341 * possible)
1342 */
1343int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1344 *root, struct btrfs_key *key, struct btrfs_path *p, int
1345 ins_len, int cow)
1346{
1347 struct extent_buffer *b;
1348 struct extent_buffer *tmp;
1349 int slot;
1350 int ret;
1351 int level;
1352 int should_reada = p->reada;
1353 int lowest_unlock = 1;
1354 int blocksize;
1355 u8 lowest_level = 0;
1356 u64 blocknr;
1357 u64 gen;
1358 struct btrfs_key prealloc_block;
1359
1360 lowest_level = p->lowest_level;
1361 WARN_ON(lowest_level && ins_len > 0);
1362 WARN_ON(p->nodes[0] != NULL);
1363
1364 if (ins_len < 0)
1365 lowest_unlock = 2;
1366
1367 prealloc_block.objectid = 0;
1368
1369again:
1370 if (p->skip_locking)
1371 b = btrfs_root_node(root);
1372 else
1373 b = btrfs_lock_root_node(root);
1374
1375 while (b) {
1376 level = btrfs_header_level(b);
1377
1378 /*
1379 * setup the path here so we can release it under lock
1380 * contention with the cow code
1381 */
1382 p->nodes[level] = b;
1383 if (!p->skip_locking)
1384 p->locks[level] = 1;
1385
1386 if (cow) {
1387 int wret;
1388
1389 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done;
1396 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398
1399 /* ok, we have to cow, is our old prealloc the right
1400 * size?
1401 */
1402 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) {
1404 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid,
1406 prealloc_block.offset);
1407 prealloc_block.objectid = 0;
1408 }
1409
1410 /*
1411 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held.
1413 */
1414 if (level > 1 && !prealloc_block.objectid &&
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len;
1417 u64 hint = b->start;
1418
1419 btrfs_release_path(root, p);
1420 ret = btrfs_reserve_extent(trans, root,
1421 size, size, 0,
1422 hint, (u64)-1,
1423 &prealloc_block, 0);
1424 BUG_ON(ret);
1425 goto again;
1426 }
1427
1428 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1],
1430 p->slots[level + 1],
1431 &b, prealloc_block.objectid);
1432 prealloc_block.objectid = 0;
1433 if (wret) {
1434 free_extent_buffer(b);
1435 ret = wret;
1436 goto done;
1437 }
1438 }
1439cow_done:
1440 BUG_ON(!cow && ins_len);
1441 if (level != btrfs_header_level(b))
1442 WARN_ON(1);
1443 level = btrfs_header_level(b);
1444
1445 p->nodes[level] = b;
1446 if (!p->skip_locking)
1447 p->locks[level] = 1;
1448
1449 ret = check_block(root, p, level);
1450 if (ret) {
1451 ret = -1;
1452 goto done;
1453 }
1454
1455 ret = bin_search(b, key, level, &slot);
1456 if (level != 0) {
1457 if (ret && slot > 0)
1458 slot -= 1;
1459 p->slots[level] = slot;
1460 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level);
1464 BUG_ON(sret > 0);
1465 if (sret) {
1466 ret = sret;
1467 goto done;
1468 }
1469 b = p->nodes[level];
1470 slot = p->slots[level];
1471 } else if (ins_len < 0) {
1472 int sret = balance_level(trans, root, p,
1473 level);
1474 if (sret) {
1475 ret = sret;
1476 goto done;
1477 }
1478 b = p->nodes[level];
1479 if (!b) {
1480 btrfs_release_path(NULL, p);
1481 goto again;
1482 }
1483 slot = p->slots[level];
1484 BUG_ON(btrfs_header_nritems(b) == 1);
1485 }
1486 unlock_up(p, level, lowest_unlock);
1487
1488 /* this is only true while dropping a snapshot */
1489 if (level == lowest_level) {
1490 ret = 0;
1491 goto done;
1492 }
1493
1494 blocknr = btrfs_node_blockptr(b, slot);
1495 gen = btrfs_node_ptr_generation(b, slot);
1496 blocksize = btrfs_level_size(root, level - 1);
1497
1498 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1499 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 b = tmp;
1501 } else {
1502 /*
1503 * reduce lock contention at high levels
1504 * of the btree by dropping locks before
1505 * we read.
1506 */
1507 if (level > 1) {
1508 btrfs_release_path(NULL, p);
1509 if (tmp)
1510 free_extent_buffer(tmp);
1511 if (should_reada)
1512 reada_for_search(root, p,
1513 level, slot,
1514 key->objectid);
1515
1516 tmp = read_tree_block(root, blocknr,
1517 blocksize, gen);
1518 if (tmp)
1519 free_extent_buffer(tmp);
1520 goto again;
1521 } else {
1522 if (tmp)
1523 free_extent_buffer(tmp);
1524 if (should_reada)
1525 reada_for_search(root, p,
1526 level, slot,
1527 key->objectid);
1528 b = read_node_slot(root, b, slot);
1529 }
1530 }
1531 if (!p->skip_locking)
1532 btrfs_tree_lock(b);
1533 } else {
1534 p->slots[level] = slot;
1535 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0);
1539 BUG_ON(sret > 0);
1540 if (sret) {
1541 ret = sret;
1542 goto done;
1543 }
1544 }
1545 if (!p->search_for_split)
1546 unlock_up(p, level, lowest_unlock);
1547 goto done;
1548 }
1549 }
1550 ret = 1;
1551done:
1552 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid,
1555 prealloc_block.offset);
1556 }
1557
1558 return ret;
1559}
1560
1561int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 struct btrfs_root *root,
1563 struct btrfs_key *node_keys,
1564 u64 *nodes, int lowest_level)
1565{
1566 struct extent_buffer *eb;
1567 struct extent_buffer *parent;
1568 struct btrfs_key key;
1569 u64 bytenr;
1570 u64 generation;
1571 u32 blocksize;
1572 int level;
1573 int slot;
1574 int key_match;
1575 int ret;
1576
1577 eb = btrfs_lock_root_node(root);
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret);
1580
1581 parent = eb;
1582 while (1) {
1583 level = btrfs_header_level(parent);
1584 if (level == 0 || level <= lowest_level)
1585 break;
1586
1587 ret = bin_search(parent, &node_keys[lowest_level], level,
1588 &slot);
1589 if (ret && slot > 0)
1590 slot--;
1591
1592 bytenr = btrfs_node_blockptr(parent, slot);
1593 if (nodes[level - 1] == bytenr)
1594 break;
1595
1596 blocksize = btrfs_level_size(root, level - 1);
1597 generation = btrfs_node_ptr_generation(parent, slot);
1598 btrfs_node_key_to_cpu(eb, &key, slot);
1599 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1600
1601 if (generation == trans->transid) {
1602 eb = read_tree_block(root, bytenr, blocksize,
1603 generation);
1604 btrfs_tree_lock(eb);
1605 }
1606
1607 /*
1608 * if node keys match and node pointer hasn't been modified
1609 * in the running transaction, we can merge the path. for
1610 * blocks owened by reloc trees, the node pointer check is
1611 * skipped, this is because these blocks are fully controlled
1612 * by the space balance code, no one else can modify them.
1613 */
1614 if (!nodes[level - 1] || !key_match ||
1615 (generation == trans->transid &&
1616 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1617 if (level == 1 || level == lowest_level + 1) {
1618 if (generation == trans->transid) {
1619 btrfs_tree_unlock(eb);
1620 free_extent_buffer(eb);
1621 }
1622 break;
1623 }
1624
1625 if (generation != trans->transid) {
1626 eb = read_tree_block(root, bytenr, blocksize,
1627 generation);
1628 btrfs_tree_lock(eb);
1629 }
1630
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1632 &eb, 0);
1633 BUG_ON(ret);
1634
1635 if (root->root_key.objectid ==
1636 BTRFS_TREE_RELOC_OBJECTID) {
1637 if (!nodes[level - 1]) {
1638 nodes[level - 1] = eb->start;
1639 memcpy(&node_keys[level - 1], &key,
1640 sizeof(node_keys[0]));
1641 } else {
1642 WARN_ON(1);
1643 }
1644 }
1645
1646 btrfs_tree_unlock(parent);
1647 free_extent_buffer(parent);
1648 parent = eb;
1649 continue;
1650 }
1651
1652 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1653 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1654 btrfs_mark_buffer_dirty(parent);
1655
1656 ret = btrfs_inc_extent_ref(trans, root,
1657 nodes[level - 1],
1658 blocksize, parent->start,
1659 btrfs_header_owner(parent),
1660 btrfs_header_generation(parent),
1661 level - 1);
1662 BUG_ON(ret);
1663
1664 /*
1665 * If the block was created in the running transaction,
1666 * it's possible this is the last reference to it, so we
1667 * should drop the subtree.
1668 */
1669 if (generation == trans->transid) {
1670 ret = btrfs_drop_subtree(trans, root, eb, parent);
1671 BUG_ON(ret);
1672 btrfs_tree_unlock(eb);
1673 free_extent_buffer(eb);
1674 } else {
1675 ret = btrfs_free_extent(trans, root, bytenr,
1676 blocksize, parent->start,
1677 btrfs_header_owner(parent),
1678 btrfs_header_generation(parent),
1679 level - 1, 1);
1680 BUG_ON(ret);
1681 }
1682 break;
1683 }
1684 btrfs_tree_unlock(parent);
1685 free_extent_buffer(parent);
1686 return 0;
1687}
1688
1689/*
1690 * adjust the pointers going up the tree, starting at level
1691 * making sure the right key of each node is points to 'key'.
1692 * This is used after shifting pointers to the left, so it stops
1693 * fixing up pointers when a given leaf/node is not in slot 0 of the
1694 * higher levels
1695 *
1696 * If this fails to write a tree block, it returns -1, but continues
1697 * fixing up the blocks in ram so the tree is consistent.
1698 */
1699static int fixup_low_keys(struct btrfs_trans_handle *trans,
1700 struct btrfs_root *root, struct btrfs_path *path,
1701 struct btrfs_disk_key *key, int level)
1702{
1703 int i;
1704 int ret = 0;
1705 struct extent_buffer *t;
1706
1707 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708 int tslot = path->slots[i];
1709 if (!path->nodes[i])
1710 break;
1711 t = path->nodes[i];
1712 btrfs_set_node_key(t, key, tslot);
1713 btrfs_mark_buffer_dirty(path->nodes[i]);
1714 if (tslot != 0)
1715 break;
1716 }
1717 return ret;
1718}
1719
1720/*
1721 * update item key.
1722 *
1723 * This function isn't completely safe. It's the caller's responsibility
1724 * that the new key won't break the order
1725 */
1726int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1727 struct btrfs_root *root, struct btrfs_path *path,
1728 struct btrfs_key *new_key)
1729{
1730 struct btrfs_disk_key disk_key;
1731 struct extent_buffer *eb;
1732 int slot;
1733
1734 eb = path->nodes[0];
1735 slot = path->slots[0];
1736 if (slot > 0) {
1737 btrfs_item_key(eb, &disk_key, slot - 1);
1738 if (comp_keys(&disk_key, new_key) >= 0)
1739 return -1;
1740 }
1741 if (slot < btrfs_header_nritems(eb) - 1) {
1742 btrfs_item_key(eb, &disk_key, slot + 1);
1743 if (comp_keys(&disk_key, new_key) <= 0)
1744 return -1;
1745 }
1746
1747 btrfs_cpu_key_to_disk(&disk_key, new_key);
1748 btrfs_set_item_key(eb, &disk_key, slot);
1749 btrfs_mark_buffer_dirty(eb);
1750 if (slot == 0)
1751 fixup_low_keys(trans, root, path, &disk_key, 1);
1752 return 0;
1753}
1754
1755/*
1756 * try to push data from one node into the next node left in the
1757 * tree.
1758 *
1759 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1760 * error, and > 0 if there was no room in the left hand block.
1761 */
1762static int push_node_left(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root, struct extent_buffer *dst,
1764 struct extent_buffer *src, int empty)
1765{
1766 int push_items = 0;
1767 int src_nritems;
1768 int dst_nritems;
1769 int ret = 0;
1770
1771 src_nritems = btrfs_header_nritems(src);
1772 dst_nritems = btrfs_header_nritems(dst);
1773 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 WARN_ON(btrfs_header_generation(src) != trans->transid);
1775 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776
1777 if (!empty && src_nritems <= 8)
1778 return 1;
1779
1780 if (push_items <= 0)
1781 return 1;
1782
1783 if (empty) {
1784 push_items = min(src_nritems, push_items);
1785 if (push_items < src_nritems) {
1786 /* leave at least 8 pointers in the node if
1787 * we aren't going to empty it
1788 */
1789 if (src_nritems - push_items < 8) {
1790 if (push_items <= 8)
1791 return 1;
1792 push_items -= 8;
1793 }
1794 }
1795 } else
1796 push_items = min(src_nritems - 8, push_items);
1797
1798 copy_extent_buffer(dst, src,
1799 btrfs_node_key_ptr_offset(dst_nritems),
1800 btrfs_node_key_ptr_offset(0),
1801 push_items * sizeof(struct btrfs_key_ptr));
1802
1803 if (push_items < src_nritems) {
1804 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1805 btrfs_node_key_ptr_offset(push_items),
1806 (src_nritems - push_items) *
1807 sizeof(struct btrfs_key_ptr));
1808 }
1809 btrfs_set_header_nritems(src, src_nritems - push_items);
1810 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1811 btrfs_mark_buffer_dirty(src);
1812 btrfs_mark_buffer_dirty(dst);
1813
1814 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1815 BUG_ON(ret);
1816
1817 return ret;
1818}
1819
1820/*
1821 * try to push data from one node into the next node right in the
1822 * tree.
1823 *
1824 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1825 * error, and > 0 if there was no room in the right hand block.
1826 *
1827 * this will only push up to 1/2 the contents of the left node over
1828 */
1829static int balance_node_right(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 struct extent_buffer *dst,
1832 struct extent_buffer *src)
1833{
1834 int push_items = 0;
1835 int max_push;
1836 int src_nritems;
1837 int dst_nritems;
1838 int ret = 0;
1839
1840 WARN_ON(btrfs_header_generation(src) != trans->transid);
1841 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1842
1843 src_nritems = btrfs_header_nritems(src);
1844 dst_nritems = btrfs_header_nritems(dst);
1845 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1846 if (push_items <= 0)
1847 return 1;
1848
1849 if (src_nritems < 4)
1850 return 1;
1851
1852 max_push = src_nritems / 2 + 1;
1853 /* don't try to empty the node */
1854 if (max_push >= src_nritems)
1855 return 1;
1856
1857 if (max_push < push_items)
1858 push_items = max_push;
1859
1860 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1861 btrfs_node_key_ptr_offset(0),
1862 (dst_nritems) *
1863 sizeof(struct btrfs_key_ptr));
1864
1865 copy_extent_buffer(dst, src,
1866 btrfs_node_key_ptr_offset(0),
1867 btrfs_node_key_ptr_offset(src_nritems - push_items),
1868 push_items * sizeof(struct btrfs_key_ptr));
1869
1870 btrfs_set_header_nritems(src, src_nritems - push_items);
1871 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872
1873 btrfs_mark_buffer_dirty(src);
1874 btrfs_mark_buffer_dirty(dst);
1875
1876 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1877 BUG_ON(ret);
1878
1879 return ret;
1880}
1881
1882/*
1883 * helper function to insert a new root level in the tree.
1884 * A new node is allocated, and a single item is inserted to
1885 * point to the existing root
1886 *
1887 * returns zero on success or < 0 on failure.
1888 */
1889static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 struct btrfs_path *path, int level)
1892{
1893 u64 lower_gen;
1894 struct extent_buffer *lower;
1895 struct extent_buffer *c;
1896 struct extent_buffer *old;
1897 struct btrfs_disk_key lower_key;
1898 int ret;
1899
1900 BUG_ON(path->nodes[level]);
1901 BUG_ON(path->nodes[level-1] != root->node);
1902
1903 lower = path->nodes[level-1];
1904 if (level == 1)
1905 btrfs_item_key(lower, &lower_key, 0);
1906 else
1907 btrfs_node_key(lower, &lower_key, 0);
1908
1909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1910 root->root_key.objectid, trans->transid,
1911 level, root->node->start, 0);
1912 if (IS_ERR(c))
1913 return PTR_ERR(c);
1914
1915 memset_extent_buffer(c, 0, 0, root->nodesize);
1916 btrfs_set_header_nritems(c, 1);
1917 btrfs_set_header_level(c, level);
1918 btrfs_set_header_bytenr(c, c->start);
1919 btrfs_set_header_generation(c, trans->transid);
1920 btrfs_set_header_owner(c, root->root_key.objectid);
1921
1922 write_extent_buffer(c, root->fs_info->fsid,
1923 (unsigned long)btrfs_header_fsid(c),
1924 BTRFS_FSID_SIZE);
1925
1926 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1927 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1928 BTRFS_UUID_SIZE);
1929
1930 btrfs_set_node_key(c, &lower_key, 0);
1931 btrfs_set_node_blockptr(c, 0, lower->start);
1932 lower_gen = btrfs_header_generation(lower);
1933 WARN_ON(lower_gen != trans->transid);
1934
1935 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936
1937 btrfs_mark_buffer_dirty(c);
1938
1939 spin_lock(&root->node_lock);
1940 old = root->node;
1941 root->node = c;
1942 spin_unlock(&root->node_lock);
1943
1944 ret = btrfs_update_extent_ref(trans, root, lower->start,
1945 lower->start, c->start,
1946 root->root_key.objectid,
1947 trans->transid, level - 1);
1948 BUG_ON(ret);
1949
1950 /* the super has an extra ref to root->node */
1951 free_extent_buffer(old);
1952
1953 add_root_to_dirty_list(root);
1954 extent_buffer_get(c);
1955 path->nodes[level] = c;
1956 path->locks[level] = 1;
1957 path->slots[level] = 0;
1958 return 0;
1959}
1960
1961/*
1962 * worker function to insert a single pointer in a node.
1963 * the node should have enough room for the pointer already
1964 *
1965 * slot and level indicate where you want the key to go, and
1966 * blocknr is the block the key points to.
1967 *
1968 * returns zero on success and < 0 on any error
1969 */
1970static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1971 *root, struct btrfs_path *path, struct btrfs_disk_key
1972 *key, u64 bytenr, int slot, int level)
1973{
1974 struct extent_buffer *lower;
1975 int nritems;
1976
1977 BUG_ON(!path->nodes[level]);
1978 lower = path->nodes[level];
1979 nritems = btrfs_header_nritems(lower);
1980 if (slot > nritems)
1981 BUG();
1982 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1983 BUG();
1984 if (slot != nritems) {
1985 memmove_extent_buffer(lower,
1986 btrfs_node_key_ptr_offset(slot + 1),
1987 btrfs_node_key_ptr_offset(slot),
1988 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1989 }
1990 btrfs_set_node_key(lower, key, slot);
1991 btrfs_set_node_blockptr(lower, slot, bytenr);
1992 WARN_ON(trans->transid == 0);
1993 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 btrfs_set_header_nritems(lower, nritems + 1);
1995 btrfs_mark_buffer_dirty(lower);
1996 return 0;
1997}
1998
1999/*
2000 * split the node at the specified level in path in two.
2001 * The path is corrected to point to the appropriate node after the split
2002 *
2003 * Before splitting this tries to make some room in the node by pushing
2004 * left and right, if either one works, it returns right away.
2005 *
2006 * returns 0 on success and < 0 on failure
2007 */
2008static noinline int split_node(struct btrfs_trans_handle *trans,
2009 struct btrfs_root *root,
2010 struct btrfs_path *path, int level)
2011{
2012 struct extent_buffer *c;
2013 struct extent_buffer *split;
2014 struct btrfs_disk_key disk_key;
2015 int mid;
2016 int ret;
2017 int wret;
2018 u32 c_nritems;
2019
2020 c = path->nodes[level];
2021 WARN_ON(btrfs_header_generation(c) != trans->transid);
2022 if (c == root->node) {
2023 /* trying to split the root, lets make a new one */
2024 ret = insert_new_root(trans, root, path, level + 1);
2025 if (ret)
2026 return ret;
2027 } else {
2028 ret = push_nodes_for_insert(trans, root, path, level);
2029 c = path->nodes[level];
2030 if (!ret && btrfs_header_nritems(c) <
2031 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032 return 0;
2033 if (ret < 0)
2034 return ret;
2035 }
2036
2037 c_nritems = btrfs_header_nritems(c);
2038
2039 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2040 path->nodes[level + 1]->start,
2041 root->root_key.objectid,
2042 trans->transid, level, c->start, 0);
2043 if (IS_ERR(split))
2044 return PTR_ERR(split);
2045
2046 btrfs_set_header_flags(split, btrfs_header_flags(c));
2047 btrfs_set_header_level(split, btrfs_header_level(c));
2048 btrfs_set_header_bytenr(split, split->start);
2049 btrfs_set_header_generation(split, trans->transid);
2050 btrfs_set_header_owner(split, root->root_key.objectid);
2051 btrfs_set_header_flags(split, 0);
2052 write_extent_buffer(split, root->fs_info->fsid,
2053 (unsigned long)btrfs_header_fsid(split),
2054 BTRFS_FSID_SIZE);
2055 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2056 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2057 BTRFS_UUID_SIZE);
2058
2059 mid = (c_nritems + 1) / 2;
2060
2061 copy_extent_buffer(split, c,
2062 btrfs_node_key_ptr_offset(0),
2063 btrfs_node_key_ptr_offset(mid),
2064 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2065 btrfs_set_header_nritems(split, c_nritems - mid);
2066 btrfs_set_header_nritems(c, mid);
2067 ret = 0;
2068
2069 btrfs_mark_buffer_dirty(c);
2070 btrfs_mark_buffer_dirty(split);
2071
2072 btrfs_node_key(split, &disk_key, 0);
2073 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074 path->slots[level + 1] + 1,
2075 level + 1);
2076 if (wret)
2077 ret = wret;
2078
2079 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2080 BUG_ON(ret);
2081
2082 if (path->slots[level] >= mid) {
2083 path->slots[level] -= mid;
2084 btrfs_tree_unlock(c);
2085 free_extent_buffer(c);
2086 path->nodes[level] = split;
2087 path->slots[level + 1] += 1;
2088 } else {
2089 btrfs_tree_unlock(split);
2090 free_extent_buffer(split);
2091 }
2092 return ret;
2093}
2094
2095/*
2096 * how many bytes are required to store the items in a leaf. start
2097 * and nr indicate which items in the leaf to check. This totals up the
2098 * space used both by the item structs and the item data
2099 */
2100static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101{
2102 int data_len;
2103 int nritems = btrfs_header_nritems(l);
2104 int end = min(nritems, start + nr) - 1;
2105
2106 if (!nr)
2107 return 0;
2108 data_len = btrfs_item_end_nr(l, start);
2109 data_len = data_len - btrfs_item_offset_nr(l, end);
2110 data_len += sizeof(struct btrfs_item) * nr;
2111 WARN_ON(data_len < 0);
2112 return data_len;
2113}
2114
2115/*
2116 * The space between the end of the leaf items and
2117 * the start of the leaf data. IOW, how much room
2118 * the leaf has left for both items and data
2119 */
2120noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121 struct extent_buffer *leaf)
2122{
2123 int nritems = btrfs_header_nritems(leaf);
2124 int ret;
2125 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2126 if (ret < 0) {
2127 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2128 "used %d nritems %d\n",
2129 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 leaf_space_used(leaf, 0, nritems), nritems);
2131 }
2132 return ret;
2133}
2134
2135/*
2136 * push some data in the path leaf to the right, trying to free up at
2137 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2138 *
2139 * returns 1 if the push failed because the other node didn't have enough
2140 * room, 0 if everything worked out and < 0 if there were major errors.
2141 */
2142static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 *root, struct btrfs_path *path, int data_size,
2144 int empty)
2145{
2146 struct extent_buffer *left = path->nodes[0];
2147 struct extent_buffer *right;
2148 struct extent_buffer *upper;
2149 struct btrfs_disk_key disk_key;
2150 int slot;
2151 u32 i;
2152 int free_space;
2153 int push_space = 0;
2154 int push_items = 0;
2155 struct btrfs_item *item;
2156 u32 left_nritems;
2157 u32 nr;
2158 u32 right_nritems;
2159 u32 data_end;
2160 u32 this_item_size;
2161 int ret;
2162
2163 slot = path->slots[1];
2164 if (!path->nodes[1])
2165 return 1;
2166
2167 upper = path->nodes[1];
2168 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1;
2170
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2172
2173 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right);
2175 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size)
2177 goto out_unlock;
2178
2179 /* cow and double check */
2180 ret = btrfs_cow_block(trans, root, right, upper,
2181 slot + 1, &right, 0);
2182 if (ret)
2183 goto out_unlock;
2184
2185 free_space = btrfs_leaf_free_space(root, right);
2186 if (free_space < data_size)
2187 goto out_unlock;
2188
2189 left_nritems = btrfs_header_nritems(left);
2190 if (left_nritems == 0)
2191 goto out_unlock;
2192
2193 if (empty)
2194 nr = 0;
2195 else
2196 nr = 1;
2197
2198 if (path->slots[0] >= left_nritems)
2199 push_space += data_size;
2200
2201 i = left_nritems - 1;
2202 while (i >= nr) {
2203 item = btrfs_item_nr(left, i);
2204
2205 if (!empty && push_items > 0) {
2206 if (path->slots[0] > i)
2207 break;
2208 if (path->slots[0] == i) {
2209 int space = btrfs_leaf_free_space(root, left);
2210 if (space + push_space * 2 > free_space)
2211 break;
2212 }
2213 }
2214
2215 if (path->slots[0] == i)
2216 push_space += data_size;
2217
2218 if (!left->map_token) {
2219 map_extent_buffer(left, (unsigned long)item,
2220 sizeof(struct btrfs_item),
2221 &left->map_token, &left->kaddr,
2222 &left->map_start, &left->map_len,
2223 KM_USER1);
2224 }
2225
2226 this_item_size = btrfs_item_size(left, item);
2227 if (this_item_size + sizeof(*item) + push_space > free_space)
2228 break;
2229
2230 push_items++;
2231 push_space += this_item_size + sizeof(*item);
2232 if (i == 0)
2233 break;
2234 i--;
2235 }
2236 if (left->map_token) {
2237 unmap_extent_buffer(left, left->map_token, KM_USER1);
2238 left->map_token = NULL;
2239 }
2240
2241 if (push_items == 0)
2242 goto out_unlock;
2243
2244 if (!empty && push_items == left_nritems)
2245 WARN_ON(1);
2246
2247 /* push left to right */
2248 right_nritems = btrfs_header_nritems(right);
2249
2250 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2251 push_space -= leaf_data_end(root, left);
2252
2253 /* make room in the right data area */
2254 data_end = leaf_data_end(root, right);
2255 memmove_extent_buffer(right,
2256 btrfs_leaf_data(right) + data_end - push_space,
2257 btrfs_leaf_data(right) + data_end,
2258 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2259
2260 /* copy from the left data area */
2261 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2262 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2263 btrfs_leaf_data(left) + leaf_data_end(root, left),
2264 push_space);
2265
2266 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2267 btrfs_item_nr_offset(0),
2268 right_nritems * sizeof(struct btrfs_item));
2269
2270 /* copy the items from left to right */
2271 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2272 btrfs_item_nr_offset(left_nritems - push_items),
2273 push_items * sizeof(struct btrfs_item));
2274
2275 /* update the item pointers */
2276 right_nritems += push_items;
2277 btrfs_set_header_nritems(right, right_nritems);
2278 push_space = BTRFS_LEAF_DATA_SIZE(root);
2279 for (i = 0; i < right_nritems; i++) {
2280 item = btrfs_item_nr(right, i);
2281 if (!right->map_token) {
2282 map_extent_buffer(right, (unsigned long)item,
2283 sizeof(struct btrfs_item),
2284 &right->map_token, &right->kaddr,
2285 &right->map_start, &right->map_len,
2286 KM_USER1);
2287 }
2288 push_space -= btrfs_item_size(right, item);
2289 btrfs_set_item_offset(right, item, push_space);
2290 }
2291
2292 if (right->map_token) {
2293 unmap_extent_buffer(right, right->map_token, KM_USER1);
2294 right->map_token = NULL;
2295 }
2296 left_nritems -= push_items;
2297 btrfs_set_header_nritems(left, left_nritems);
2298
2299 if (left_nritems)
2300 btrfs_mark_buffer_dirty(left);
2301 btrfs_mark_buffer_dirty(right);
2302
2303 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2304 BUG_ON(ret);
2305
2306 btrfs_item_key(right, &disk_key, 0);
2307 btrfs_set_node_key(upper, &disk_key, slot + 1);
2308 btrfs_mark_buffer_dirty(upper);
2309
2310 /* then fixup the leaf pointer in the path */
2311 if (path->slots[0] >= left_nritems) {
2312 path->slots[0] -= left_nritems;
2313 if (btrfs_header_nritems(path->nodes[0]) == 0)
2314 clean_tree_block(trans, root, path->nodes[0]);
2315 btrfs_tree_unlock(path->nodes[0]);
2316 free_extent_buffer(path->nodes[0]);
2317 path->nodes[0] = right;
2318 path->slots[1] += 1;
2319 } else {
2320 btrfs_tree_unlock(right);
2321 free_extent_buffer(right);
2322 }
2323 return 0;
2324
2325out_unlock:
2326 btrfs_tree_unlock(right);
2327 free_extent_buffer(right);
2328 return 1;
2329}
2330
2331/*
2332 * push some data in the path leaf to the left, trying to free up at
2333 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2334 */
2335static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 *root, struct btrfs_path *path, int data_size,
2337 int empty)
2338{
2339 struct btrfs_disk_key disk_key;
2340 struct extent_buffer *right = path->nodes[0];
2341 struct extent_buffer *left;
2342 int slot;
2343 int i;
2344 int free_space;
2345 int push_space = 0;
2346 int push_items = 0;
2347 struct btrfs_item *item;
2348 u32 old_left_nritems;
2349 u32 right_nritems;
2350 u32 nr;
2351 int ret = 0;
2352 int wret;
2353 u32 this_item_size;
2354 u32 old_left_item_size;
2355
2356 slot = path->slots[1];
2357 if (slot == 0)
2358 return 1;
2359 if (!path->nodes[1])
2360 return 1;
2361
2362 right_nritems = btrfs_header_nritems(right);
2363 if (right_nritems == 0)
2364 return 1;
2365
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2367
2368 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left);
2370 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) {
2372 ret = 1;
2373 goto out;
2374 }
2375
2376 /* cow and double check */
2377 ret = btrfs_cow_block(trans, root, left,
2378 path->nodes[1], slot - 1, &left, 0);
2379 if (ret) {
2380 /* we hit -ENOSPC, but it isn't fatal here */
2381 ret = 1;
2382 goto out;
2383 }
2384
2385 free_space = btrfs_leaf_free_space(root, left);
2386 if (free_space < data_size) {
2387 ret = 1;
2388 goto out;
2389 }
2390
2391 if (empty)
2392 nr = right_nritems;
2393 else
2394 nr = right_nritems - 1;
2395
2396 for (i = 0; i < nr; i++) {
2397 item = btrfs_item_nr(right, i);
2398 if (!right->map_token) {
2399 map_extent_buffer(right, (unsigned long)item,
2400 sizeof(struct btrfs_item),
2401 &right->map_token, &right->kaddr,
2402 &right->map_start, &right->map_len,
2403 KM_USER1);
2404 }
2405
2406 if (!empty && push_items > 0) {
2407 if (path->slots[0] < i)
2408 break;
2409 if (path->slots[0] == i) {
2410 int space = btrfs_leaf_free_space(root, right);
2411 if (space + push_space * 2 > free_space)
2412 break;
2413 }
2414 }
2415
2416 if (path->slots[0] == i)
2417 push_space += data_size;
2418
2419 this_item_size = btrfs_item_size(right, item);
2420 if (this_item_size + sizeof(*item) + push_space > free_space)
2421 break;
2422
2423 push_items++;
2424 push_space += this_item_size + sizeof(*item);
2425 }
2426
2427 if (right->map_token) {
2428 unmap_extent_buffer(right, right->map_token, KM_USER1);
2429 right->map_token = NULL;
2430 }
2431
2432 if (push_items == 0) {
2433 ret = 1;
2434 goto out;
2435 }
2436 if (!empty && push_items == btrfs_header_nritems(right))
2437 WARN_ON(1);
2438
2439 /* push data from right to left */
2440 copy_extent_buffer(left, right,
2441 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2442 btrfs_item_nr_offset(0),
2443 push_items * sizeof(struct btrfs_item));
2444
2445 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2446 btrfs_item_offset_nr(right, push_items - 1);
2447
2448 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2449 leaf_data_end(root, left) - push_space,
2450 btrfs_leaf_data(right) +
2451 btrfs_item_offset_nr(right, push_items - 1),
2452 push_space);
2453 old_left_nritems = btrfs_header_nritems(left);
2454 BUG_ON(old_left_nritems <= 0);
2455
2456 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2457 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458 u32 ioff;
2459
2460 item = btrfs_item_nr(left, i);
2461 if (!left->map_token) {
2462 map_extent_buffer(left, (unsigned long)item,
2463 sizeof(struct btrfs_item),
2464 &left->map_token, &left->kaddr,
2465 &left->map_start, &left->map_len,
2466 KM_USER1);
2467 }
2468
2469 ioff = btrfs_item_offset(left, item);
2470 btrfs_set_item_offset(left, item,
2471 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472 }
2473 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 if (left->map_token) {
2475 unmap_extent_buffer(left, left->map_token, KM_USER1);
2476 left->map_token = NULL;
2477 }
2478
2479 /* fixup right node */
2480 if (push_items > right_nritems) {
2481 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2482 right_nritems);
2483 WARN_ON(1);
2484 }
2485
2486 if (push_items < right_nritems) {
2487 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2488 leaf_data_end(root, right);
2489 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2490 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2491 btrfs_leaf_data(right) +
2492 leaf_data_end(root, right), push_space);
2493
2494 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 btrfs_item_nr_offset(push_items),
2496 (btrfs_header_nritems(right) - push_items) *
2497 sizeof(struct btrfs_item));
2498 }
2499 right_nritems -= push_items;
2500 btrfs_set_header_nritems(right, right_nritems);
2501 push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 for (i = 0; i < right_nritems; i++) {
2503 item = btrfs_item_nr(right, i);
2504
2505 if (!right->map_token) {
2506 map_extent_buffer(right, (unsigned long)item,
2507 sizeof(struct btrfs_item),
2508 &right->map_token, &right->kaddr,
2509 &right->map_start, &right->map_len,
2510 KM_USER1);
2511 }
2512
2513 push_space = push_space - btrfs_item_size(right, item);
2514 btrfs_set_item_offset(right, item, push_space);
2515 }
2516 if (right->map_token) {
2517 unmap_extent_buffer(right, right->map_token, KM_USER1);
2518 right->map_token = NULL;
2519 }
2520
2521 btrfs_mark_buffer_dirty(left);
2522 if (right_nritems)
2523 btrfs_mark_buffer_dirty(right);
2524
2525 ret = btrfs_update_ref(trans, root, right, left,
2526 old_left_nritems, push_items);
2527 BUG_ON(ret);
2528
2529 btrfs_item_key(right, &disk_key, 0);
2530 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2531 if (wret)
2532 ret = wret;
2533
2534 /* then fixup the leaf pointer in the path */
2535 if (path->slots[0] < push_items) {
2536 path->slots[0] += old_left_nritems;
2537 if (btrfs_header_nritems(path->nodes[0]) == 0)
2538 clean_tree_block(trans, root, path->nodes[0]);
2539 btrfs_tree_unlock(path->nodes[0]);
2540 free_extent_buffer(path->nodes[0]);
2541 path->nodes[0] = left;
2542 path->slots[1] -= 1;
2543 } else {
2544 btrfs_tree_unlock(left);
2545 free_extent_buffer(left);
2546 path->slots[0] -= push_items;
2547 }
2548 BUG_ON(path->slots[0] < 0);
2549 return ret;
2550out:
2551 btrfs_tree_unlock(left);
2552 free_extent_buffer(left);
2553 return ret;
2554}
2555
2556/*
2557 * split the path's leaf in two, making sure there is at least data_size
2558 * available for the resulting leaf level of the path.
2559 *
2560 * returns 0 if all went well and < 0 on failure.
2561 */
2562static noinline int split_leaf(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root,
2564 struct btrfs_key *ins_key,
2565 struct btrfs_path *path, int data_size,
2566 int extend)
2567{
2568 struct extent_buffer *l;
2569 u32 nritems;
2570 int mid;
2571 int slot;
2572 struct extent_buffer *right;
2573 int data_copy_size;
2574 int rt_data_off;
2575 int i;
2576 int ret = 0;
2577 int wret;
2578 int double_split;
2579 int num_doubles = 0;
2580 struct btrfs_disk_key disk_key;
2581
2582 /* first try to make some room by pushing left and right */
2583 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584 wret = push_leaf_right(trans, root, path, data_size, 0);
2585 if (wret < 0)
2586 return wret;
2587 if (wret) {
2588 wret = push_leaf_left(trans, root, path, data_size, 0);
2589 if (wret < 0)
2590 return wret;
2591 }
2592 l = path->nodes[0];
2593
2594 /* did the pushes work? */
2595 if (btrfs_leaf_free_space(root, l) >= data_size)
2596 return 0;
2597 }
2598
2599 if (!path->nodes[1]) {
2600 ret = insert_new_root(trans, root, path, 1);
2601 if (ret)
2602 return ret;
2603 }
2604again:
2605 double_split = 0;
2606 l = path->nodes[0];
2607 slot = path->slots[0];
2608 nritems = btrfs_header_nritems(l);
2609 mid = (nritems + 1) / 2;
2610
2611 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2612 path->nodes[1]->start,
2613 root->root_key.objectid,
2614 trans->transid, 0, l->start, 0);
2615 if (IS_ERR(right)) {
2616 BUG_ON(1);
2617 return PTR_ERR(right);
2618 }
2619
2620 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621 btrfs_set_header_bytenr(right, right->start);
2622 btrfs_set_header_generation(right, trans->transid);
2623 btrfs_set_header_owner(right, root->root_key.objectid);
2624 btrfs_set_header_level(right, 0);
2625 write_extent_buffer(right, root->fs_info->fsid,
2626 (unsigned long)btrfs_header_fsid(right),
2627 BTRFS_FSID_SIZE);
2628
2629 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2630 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2631 BTRFS_UUID_SIZE);
2632 if (mid <= slot) {
2633 if (nritems == 1 ||
2634 leaf_space_used(l, mid, nritems - mid) + data_size >
2635 BTRFS_LEAF_DATA_SIZE(root)) {
2636 if (slot >= nritems) {
2637 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638 btrfs_set_header_nritems(right, 0);
2639 wret = insert_ptr(trans, root, path,
2640 &disk_key, right->start,
2641 path->slots[1] + 1, 1);
2642 if (wret)
2643 ret = wret;
2644
2645 btrfs_tree_unlock(path->nodes[0]);
2646 free_extent_buffer(path->nodes[0]);
2647 path->nodes[0] = right;
2648 path->slots[0] = 0;
2649 path->slots[1] += 1;
2650 btrfs_mark_buffer_dirty(right);
2651 return ret;
2652 }
2653 mid = slot;
2654 if (mid != nritems &&
2655 leaf_space_used(l, mid, nritems - mid) +
2656 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 double_split = 1;
2658 }
2659 }
2660 } else {
2661 if (leaf_space_used(l, 0, mid) + data_size >
2662 BTRFS_LEAF_DATA_SIZE(root)) {
2663 if (!extend && data_size && slot == 0) {
2664 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665 btrfs_set_header_nritems(right, 0);
2666 wret = insert_ptr(trans, root, path,
2667 &disk_key,
2668 right->start,
2669 path->slots[1], 1);
2670 if (wret)
2671 ret = wret;
2672 btrfs_tree_unlock(path->nodes[0]);
2673 free_extent_buffer(path->nodes[0]);
2674 path->nodes[0] = right;
2675 path->slots[0] = 0;
2676 if (path->slots[1] == 0) {
2677 wret = fixup_low_keys(trans, root,
2678 path, &disk_key, 1);
2679 if (wret)
2680 ret = wret;
2681 }
2682 btrfs_mark_buffer_dirty(right);
2683 return ret;
2684 } else if ((extend || !data_size) && slot == 0) {
2685 mid = 1;
2686 } else {
2687 mid = slot;
2688 if (mid != nritems &&
2689 leaf_space_used(l, mid, nritems - mid) +
2690 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 double_split = 1;
2692 }
2693 }
2694 }
2695 }
2696 nritems = nritems - mid;
2697 btrfs_set_header_nritems(right, nritems);
2698 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2699
2700 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2701 btrfs_item_nr_offset(mid),
2702 nritems * sizeof(struct btrfs_item));
2703
2704 copy_extent_buffer(right, l,
2705 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2706 data_copy_size, btrfs_leaf_data(l) +
2707 leaf_data_end(root, l), data_copy_size);
2708
2709 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710 btrfs_item_end_nr(l, mid);
2711
2712 for (i = 0; i < nritems; i++) {
2713 struct btrfs_item *item = btrfs_item_nr(right, i);
2714 u32 ioff;
2715
2716 if (!right->map_token) {
2717 map_extent_buffer(right, (unsigned long)item,
2718 sizeof(struct btrfs_item),
2719 &right->map_token, &right->kaddr,
2720 &right->map_start, &right->map_len,
2721 KM_USER1);
2722 }
2723
2724 ioff = btrfs_item_offset(right, item);
2725 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2726 }
2727
2728 if (right->map_token) {
2729 unmap_extent_buffer(right, right->map_token, KM_USER1);
2730 right->map_token = NULL;
2731 }
2732
2733 btrfs_set_header_nritems(l, mid);
2734 ret = 0;
2735 btrfs_item_key(right, &disk_key, 0);
2736 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2737 path->slots[1] + 1, 1);
2738 if (wret)
2739 ret = wret;
2740
2741 btrfs_mark_buffer_dirty(right);
2742 btrfs_mark_buffer_dirty(l);
2743 BUG_ON(path->slots[0] != slot);
2744
2745 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2746 BUG_ON(ret);
2747
2748 if (mid <= slot) {
2749 btrfs_tree_unlock(path->nodes[0]);
2750 free_extent_buffer(path->nodes[0]);
2751 path->nodes[0] = right;
2752 path->slots[0] -= mid;
2753 path->slots[1] += 1;
2754 } else {
2755 btrfs_tree_unlock(right);
2756 free_extent_buffer(right);
2757 }
2758
2759 BUG_ON(path->slots[0] < 0);
2760
2761 if (double_split) {
2762 BUG_ON(num_doubles != 0);
2763 num_doubles++;
2764 goto again;
2765 }
2766 return ret;
2767}
2768
2769/*
2770 * This function splits a single item into two items,
2771 * giving 'new_key' to the new item and splitting the
2772 * old one at split_offset (from the start of the item).
2773 *
2774 * The path may be released by this operation. After
2775 * the split, the path is pointing to the old item. The
2776 * new item is going to be in the same node as the old one.
2777 *
2778 * Note, the item being split must be smaller enough to live alone on
2779 * a tree block with room for one extra struct btrfs_item
2780 *
2781 * This allows us to split the item in place, keeping a lock on the
2782 * leaf the entire time.
2783 */
2784int btrfs_split_item(struct btrfs_trans_handle *trans,
2785 struct btrfs_root *root,
2786 struct btrfs_path *path,
2787 struct btrfs_key *new_key,
2788 unsigned long split_offset)
2789{
2790 u32 item_size;
2791 struct extent_buffer *leaf;
2792 struct btrfs_key orig_key;
2793 struct btrfs_item *item;
2794 struct btrfs_item *new_item;
2795 int ret = 0;
2796 int slot;
2797 u32 nritems;
2798 u32 orig_offset;
2799 struct btrfs_disk_key disk_key;
2800 char *buf;
2801
2802 leaf = path->nodes[0];
2803 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
2804 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
2805 goto split;
2806
2807 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2808 btrfs_release_path(root, path);
2809
2810 path->search_for_split = 1;
2811 path->keep_locks = 1;
2812
2813 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
2814 path->search_for_split = 0;
2815
2816 /* if our item isn't there or got smaller, return now */
2817 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
2818 path->slots[0])) {
2819 path->keep_locks = 0;
2820 return -EAGAIN;
2821 }
2822
2823 ret = split_leaf(trans, root, &orig_key, path,
2824 sizeof(struct btrfs_item), 1);
2825 path->keep_locks = 0;
2826 BUG_ON(ret);
2827
2828 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830
2831split:
2832 item = btrfs_item_nr(leaf, path->slots[0]);
2833 orig_offset = btrfs_item_offset(leaf, item);
2834 item_size = btrfs_item_size(leaf, item);
2835
2836
2837 buf = kmalloc(item_size, GFP_NOFS);
2838 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
2839 path->slots[0]), item_size);
2840 slot = path->slots[0] + 1;
2841 leaf = path->nodes[0];
2842
2843 nritems = btrfs_header_nritems(leaf);
2844
2845 if (slot != nritems) {
2846 /* shift the items */
2847 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
2848 btrfs_item_nr_offset(slot),
2849 (nritems - slot) * sizeof(struct btrfs_item));
2850
2851 }
2852
2853 btrfs_cpu_key_to_disk(&disk_key, new_key);
2854 btrfs_set_item_key(leaf, &disk_key, slot);
2855
2856 new_item = btrfs_item_nr(leaf, slot);
2857
2858 btrfs_set_item_offset(leaf, new_item, orig_offset);
2859 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
2860
2861 btrfs_set_item_offset(leaf, item,
2862 orig_offset + item_size - split_offset);
2863 btrfs_set_item_size(leaf, item, split_offset);
2864
2865 btrfs_set_header_nritems(leaf, nritems + 1);
2866
2867 /* write the data for the start of the original item */
2868 write_extent_buffer(leaf, buf,
2869 btrfs_item_ptr_offset(leaf, path->slots[0]),
2870 split_offset);
2871
2872 /* write the data for the new item */
2873 write_extent_buffer(leaf, buf + split_offset,
2874 btrfs_item_ptr_offset(leaf, slot),
2875 item_size - split_offset);
2876 btrfs_mark_buffer_dirty(leaf);
2877
2878 ret = 0;
2879 if (btrfs_leaf_free_space(root, leaf) < 0) {
2880 btrfs_print_leaf(root, leaf);
2881 BUG();
2882 }
2883 kfree(buf);
2884 return ret;
2885}
2886
2887/*
2888 * make the item pointed to by the path smaller. new_size indicates
2889 * how small to make it, and from_end tells us if we just chop bytes
2890 * off the end of the item or if we shift the item to chop bytes off
2891 * the front.
2892 */
2893int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2894 struct btrfs_root *root,
2895 struct btrfs_path *path,
2896 u32 new_size, int from_end)
2897{
2898 int ret = 0;
2899 int slot;
2900 int slot_orig;
2901 struct extent_buffer *leaf;
2902 struct btrfs_item *item;
2903 u32 nritems;
2904 unsigned int data_end;
2905 unsigned int old_data_start;
2906 unsigned int old_size;
2907 unsigned int size_diff;
2908 int i;
2909
2910 slot_orig = path->slots[0];
2911 leaf = path->nodes[0];
2912 slot = path->slots[0];
2913
2914 old_size = btrfs_item_size_nr(leaf, slot);
2915 if (old_size == new_size)
2916 return 0;
2917
2918 nritems = btrfs_header_nritems(leaf);
2919 data_end = leaf_data_end(root, leaf);
2920
2921 old_data_start = btrfs_item_offset_nr(leaf, slot);
2922
2923 size_diff = old_size - new_size;
2924
2925 BUG_ON(slot < 0);
2926 BUG_ON(slot >= nritems);
2927
2928 /*
2929 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2930 */
2931 /* first correct the data pointers */
2932 for (i = slot; i < nritems; i++) {
2933 u32 ioff;
2934 item = btrfs_item_nr(leaf, i);
2935
2936 if (!leaf->map_token) {
2937 map_extent_buffer(leaf, (unsigned long)item,
2938 sizeof(struct btrfs_item),
2939 &leaf->map_token, &leaf->kaddr,
2940 &leaf->map_start, &leaf->map_len,
2941 KM_USER1);
2942 }
2943
2944 ioff = btrfs_item_offset(leaf, item);
2945 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2946 }
2947
2948 if (leaf->map_token) {
2949 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2950 leaf->map_token = NULL;
2951 }
2952
2953 /* shift the data */
2954 if (from_end) {
2955 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2956 data_end + size_diff, btrfs_leaf_data(leaf) +
2957 data_end, old_data_start + new_size - data_end);
2958 } else {
2959 struct btrfs_disk_key disk_key;
2960 u64 offset;
2961
2962 btrfs_item_key(leaf, &disk_key, slot);
2963
2964 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2965 unsigned long ptr;
2966 struct btrfs_file_extent_item *fi;
2967
2968 fi = btrfs_item_ptr(leaf, slot,
2969 struct btrfs_file_extent_item);
2970 fi = (struct btrfs_file_extent_item *)(
2971 (unsigned long)fi - size_diff);
2972
2973 if (btrfs_file_extent_type(leaf, fi) ==
2974 BTRFS_FILE_EXTENT_INLINE) {
2975 ptr = btrfs_item_ptr_offset(leaf, slot);
2976 memmove_extent_buffer(leaf, ptr,
2977 (unsigned long)fi,
2978 offsetof(struct btrfs_file_extent_item,
2979 disk_bytenr));
2980 }
2981 }
2982
2983 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2984 data_end + size_diff, btrfs_leaf_data(leaf) +
2985 data_end, old_data_start - data_end);
2986
2987 offset = btrfs_disk_key_offset(&disk_key);
2988 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2989 btrfs_set_item_key(leaf, &disk_key, slot);
2990 if (slot == 0)
2991 fixup_low_keys(trans, root, path, &disk_key, 1);
2992 }
2993
2994 item = btrfs_item_nr(leaf, slot);
2995 btrfs_set_item_size(leaf, item, new_size);
2996 btrfs_mark_buffer_dirty(leaf);
2997
2998 ret = 0;
2999 if (btrfs_leaf_free_space(root, leaf) < 0) {
3000 btrfs_print_leaf(root, leaf);
3001 BUG();
3002 }
3003 return ret;
3004}
3005
3006/*
3007 * make the item pointed to by the path bigger, data_size is the new size.
3008 */
3009int btrfs_extend_item(struct btrfs_trans_handle *trans,
3010 struct btrfs_root *root, struct btrfs_path *path,
3011 u32 data_size)
3012{
3013 int ret = 0;
3014 int slot;
3015 int slot_orig;
3016 struct extent_buffer *leaf;
3017 struct btrfs_item *item;
3018 u32 nritems;
3019 unsigned int data_end;
3020 unsigned int old_data;
3021 unsigned int old_size;
3022 int i;
3023
3024 slot_orig = path->slots[0];
3025 leaf = path->nodes[0];
3026
3027 nritems = btrfs_header_nritems(leaf);
3028 data_end = leaf_data_end(root, leaf);
3029
3030 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3031 btrfs_print_leaf(root, leaf);
3032 BUG();
3033 }
3034 slot = path->slots[0];
3035 old_data = btrfs_item_end_nr(leaf, slot);
3036
3037 BUG_ON(slot < 0);
3038 if (slot >= nritems) {
3039 btrfs_print_leaf(root, leaf);
3040 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3041 slot, nritems);
3042 BUG_ON(1);
3043 }
3044
3045 /*
3046 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3047 */
3048 /* first correct the data pointers */
3049 for (i = slot; i < nritems; i++) {
3050 u32 ioff;
3051 item = btrfs_item_nr(leaf, i);
3052
3053 if (!leaf->map_token) {
3054 map_extent_buffer(leaf, (unsigned long)item,
3055 sizeof(struct btrfs_item),
3056 &leaf->map_token, &leaf->kaddr,
3057 &leaf->map_start, &leaf->map_len,
3058 KM_USER1);
3059 }
3060 ioff = btrfs_item_offset(leaf, item);
3061 btrfs_set_item_offset(leaf, item, ioff - data_size);
3062 }
3063
3064 if (leaf->map_token) {
3065 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3066 leaf->map_token = NULL;
3067 }
3068
3069 /* shift the data */
3070 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 data_end - data_size, btrfs_leaf_data(leaf) +
3072 data_end, old_data - data_end);
3073
3074 data_end = old_data;
3075 old_size = btrfs_item_size_nr(leaf, slot);
3076 item = btrfs_item_nr(leaf, slot);
3077 btrfs_set_item_size(leaf, item, old_size + data_size);
3078 btrfs_mark_buffer_dirty(leaf);
3079
3080 ret = 0;
3081 if (btrfs_leaf_free_space(root, leaf) < 0) {
3082 btrfs_print_leaf(root, leaf);
3083 BUG();
3084 }
3085 return ret;
3086}
3087
3088/*
3089 * Given a key and some data, insert items into the tree.
3090 * This does all the path init required, making room in the tree if needed.
3091 * Returns the number of keys that were inserted.
3092 */
3093int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3094 struct btrfs_root *root,
3095 struct btrfs_path *path,
3096 struct btrfs_key *cpu_key, u32 *data_size,
3097 int nr)
3098{
3099 struct extent_buffer *leaf;
3100 struct btrfs_item *item;
3101 int ret = 0;
3102 int slot;
3103 int i;
3104 u32 nritems;
3105 u32 total_data = 0;
3106 u32 total_size = 0;
3107 unsigned int data_end;
3108 struct btrfs_disk_key disk_key;
3109 struct btrfs_key found_key;
3110
3111 for (i = 0; i < nr; i++) {
3112 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3113 BTRFS_LEAF_DATA_SIZE(root)) {
3114 break;
3115 nr = i;
3116 }
3117 total_data += data_size[i];
3118 total_size += data_size[i] + sizeof(struct btrfs_item);
3119 }
3120 BUG_ON(nr == 0);
3121
3122 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3123 if (ret == 0)
3124 return -EEXIST;
3125 if (ret < 0)
3126 goto out;
3127
3128 leaf = path->nodes[0];
3129
3130 nritems = btrfs_header_nritems(leaf);
3131 data_end = leaf_data_end(root, leaf);
3132
3133 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3134 for (i = nr; i >= 0; i--) {
3135 total_data -= data_size[i];
3136 total_size -= data_size[i] + sizeof(struct btrfs_item);
3137 if (total_size < btrfs_leaf_free_space(root, leaf))
3138 break;
3139 }
3140 nr = i;
3141 }
3142
3143 slot = path->slots[0];
3144 BUG_ON(slot < 0);
3145
3146 if (slot != nritems) {
3147 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3148
3149 item = btrfs_item_nr(leaf, slot);
3150 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3151
3152 /* figure out how many keys we can insert in here */
3153 total_data = data_size[0];
3154 for (i = 1; i < nr; i++) {
3155 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3156 break;
3157 total_data += data_size[i];
3158 }
3159 nr = i;
3160
3161 if (old_data < data_end) {
3162 btrfs_print_leaf(root, leaf);
3163 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 slot, old_data, data_end);
3165 BUG_ON(1);
3166 }
3167 /*
3168 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3169 */
3170 /* first correct the data pointers */
3171 WARN_ON(leaf->map_token);
3172 for (i = slot; i < nritems; i++) {
3173 u32 ioff;
3174
3175 item = btrfs_item_nr(leaf, i);
3176 if (!leaf->map_token) {
3177 map_extent_buffer(leaf, (unsigned long)item,
3178 sizeof(struct btrfs_item),
3179 &leaf->map_token, &leaf->kaddr,
3180 &leaf->map_start, &leaf->map_len,
3181 KM_USER1);
3182 }
3183
3184 ioff = btrfs_item_offset(leaf, item);
3185 btrfs_set_item_offset(leaf, item, ioff - total_data);
3186 }
3187 if (leaf->map_token) {
3188 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3189 leaf->map_token = NULL;
3190 }
3191
3192 /* shift the items */
3193 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3194 btrfs_item_nr_offset(slot),
3195 (nritems - slot) * sizeof(struct btrfs_item));
3196
3197 /* shift the data */
3198 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3199 data_end - total_data, btrfs_leaf_data(leaf) +
3200 data_end, old_data - data_end);
3201 data_end = old_data;
3202 } else {
3203 /*
3204 * this sucks but it has to be done, if we are inserting at
3205 * the end of the leaf only insert 1 of the items, since we
3206 * have no way of knowing whats on the next leaf and we'd have
3207 * to drop our current locks to figure it out
3208 */
3209 nr = 1;
3210 }
3211
3212 /* setup the item for the new data */
3213 for (i = 0; i < nr; i++) {
3214 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3215 btrfs_set_item_key(leaf, &disk_key, slot + i);
3216 item = btrfs_item_nr(leaf, slot + i);
3217 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3218 data_end -= data_size[i];
3219 btrfs_set_item_size(leaf, item, data_size[i]);
3220 }
3221 btrfs_set_header_nritems(leaf, nritems + nr);
3222 btrfs_mark_buffer_dirty(leaf);
3223
3224 ret = 0;
3225 if (slot == 0) {
3226 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3227 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3228 }
3229
3230 if (btrfs_leaf_free_space(root, leaf) < 0) {
3231 btrfs_print_leaf(root, leaf);
3232 BUG();
3233 }
3234out:
3235 if (!ret)
3236 ret = nr;
3237 return ret;
3238}
3239
3240/*
3241 * Given a key and some data, insert items into the tree.
3242 * This does all the path init required, making room in the tree if needed.
3243 */
3244int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 struct btrfs_root *root,
3246 struct btrfs_path *path,
3247 struct btrfs_key *cpu_key, u32 *data_size,
3248 int nr)
3249{
3250 struct extent_buffer *leaf;
3251 struct btrfs_item *item;
3252 int ret = 0;
3253 int slot;
3254 int slot_orig;
3255 int i;
3256 u32 nritems;
3257 u32 total_size = 0;
3258 u32 total_data = 0;
3259 unsigned int data_end;
3260 struct btrfs_disk_key disk_key;
3261
3262 for (i = 0; i < nr; i++)
3263 total_data += data_size[i];
3264
3265 total_size = total_data + (nr * sizeof(struct btrfs_item));
3266 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3267 if (ret == 0)
3268 return -EEXIST;
3269 if (ret < 0)
3270 goto out;
3271
3272 slot_orig = path->slots[0];
3273 leaf = path->nodes[0];
3274
3275 nritems = btrfs_header_nritems(leaf);
3276 data_end = leaf_data_end(root, leaf);
3277
3278 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279 btrfs_print_leaf(root, leaf);
3280 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281 total_size, btrfs_leaf_free_space(root, leaf));
3282 BUG();
3283 }
3284
3285 slot = path->slots[0];
3286 BUG_ON(slot < 0);
3287
3288 if (slot != nritems) {
3289 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290
3291 if (old_data < data_end) {
3292 btrfs_print_leaf(root, leaf);
3293 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 slot, old_data, data_end);
3295 BUG_ON(1);
3296 }
3297 /*
3298 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3299 */
3300 /* first correct the data pointers */
3301 WARN_ON(leaf->map_token);
3302 for (i = slot; i < nritems; i++) {
3303 u32 ioff;
3304
3305 item = btrfs_item_nr(leaf, i);
3306 if (!leaf->map_token) {
3307 map_extent_buffer(leaf, (unsigned long)item,
3308 sizeof(struct btrfs_item),
3309 &leaf->map_token, &leaf->kaddr,
3310 &leaf->map_start, &leaf->map_len,
3311 KM_USER1);
3312 }
3313
3314 ioff = btrfs_item_offset(leaf, item);
3315 btrfs_set_item_offset(leaf, item, ioff - total_data);
3316 }
3317 if (leaf->map_token) {
3318 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3319 leaf->map_token = NULL;
3320 }
3321
3322 /* shift the items */
3323 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324 btrfs_item_nr_offset(slot),
3325 (nritems - slot) * sizeof(struct btrfs_item));
3326
3327 /* shift the data */
3328 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329 data_end - total_data, btrfs_leaf_data(leaf) +
3330 data_end, old_data - data_end);
3331 data_end = old_data;
3332 }
3333
3334 /* setup the item for the new data */
3335 for (i = 0; i < nr; i++) {
3336 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3337 btrfs_set_item_key(leaf, &disk_key, slot + i);
3338 item = btrfs_item_nr(leaf, slot + i);
3339 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3340 data_end -= data_size[i];
3341 btrfs_set_item_size(leaf, item, data_size[i]);
3342 }
3343 btrfs_set_header_nritems(leaf, nritems + nr);
3344 btrfs_mark_buffer_dirty(leaf);
3345
3346 ret = 0;
3347 if (slot == 0) {
3348 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350 }
3351
3352 if (btrfs_leaf_free_space(root, leaf) < 0) {
3353 btrfs_print_leaf(root, leaf);
3354 BUG();
3355 }
3356out:
3357 return ret;
3358}
3359
3360/*
3361 * Given a key and some data, insert an item into the tree.
3362 * This does all the path init required, making room in the tree if needed.
3363 */
3364int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3365 *root, struct btrfs_key *cpu_key, void *data, u32
3366 data_size)
3367{
3368 int ret = 0;
3369 struct btrfs_path *path;
3370 struct extent_buffer *leaf;
3371 unsigned long ptr;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376 if (!ret) {
3377 leaf = path->nodes[0];
3378 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3379 write_extent_buffer(leaf, data, ptr, data_size);
3380 btrfs_mark_buffer_dirty(leaf);
3381 }
3382 btrfs_free_path(path);
3383 return ret;
3384}
3385
3386/*
3387 * delete the pointer from a given node.
3388 *
3389 * the tree should have been previously balanced so the deletion does not
3390 * empty a node.
3391 */
3392static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3393 struct btrfs_path *path, int level, int slot)
3394{
3395 struct extent_buffer *parent = path->nodes[level];
3396 u32 nritems;
3397 int ret = 0;
3398 int wret;
3399
3400 nritems = btrfs_header_nritems(parent);
3401 if (slot != nritems - 1) {
3402 memmove_extent_buffer(parent,
3403 btrfs_node_key_ptr_offset(slot),
3404 btrfs_node_key_ptr_offset(slot + 1),
3405 sizeof(struct btrfs_key_ptr) *
3406 (nritems - slot - 1));
3407 }
3408 nritems--;
3409 btrfs_set_header_nritems(parent, nritems);
3410 if (nritems == 0 && parent == root->node) {
3411 BUG_ON(btrfs_header_level(root->node) != 1);
3412 /* just turn the root into a leaf and break */
3413 btrfs_set_header_level(root->node, 0);
3414 } else if (slot == 0) {
3415 struct btrfs_disk_key disk_key;
3416
3417 btrfs_node_key(parent, &disk_key, 0);
3418 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3419 if (wret)
3420 ret = wret;
3421 }
3422 btrfs_mark_buffer_dirty(parent);
3423 return ret;
3424}
3425
3426/*
3427 * a helper function to delete the leaf pointed to by path->slots[1] and
3428 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3429 * already know it, it is faster to have them pass it down than to
3430 * read it out of the node again.
3431 *
3432 * This deletes the pointer in path->nodes[1] and frees the leaf
3433 * block extent. zero is returned if it all worked out, < 0 otherwise.
3434 *
3435 * The path must have already been setup for deleting the leaf, including
3436 * all the proper balancing. path->nodes[1] must be locked.
3437 */
3438noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_path *path, u64 bytenr)
3441{
3442 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3444
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret)
3447 return ret;
3448
3449 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0),
3451 path->nodes[1]->start,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1);
3454 return ret;
3455}
3456/*
3457 * delete the item at the leaf level in path. If that empties
3458 * the leaf, remove it from the tree
3459 */
3460int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3461 struct btrfs_path *path, int slot, int nr)
3462{
3463 struct extent_buffer *leaf;
3464 struct btrfs_item *item;
3465 int last_off;
3466 int dsize = 0;
3467 int ret = 0;
3468 int wret;
3469 int i;
3470 u32 nritems;
3471
3472 leaf = path->nodes[0];
3473 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3474
3475 for (i = 0; i < nr; i++)
3476 dsize += btrfs_item_size_nr(leaf, slot + i);
3477
3478 nritems = btrfs_header_nritems(leaf);
3479
3480 if (slot + nr != nritems) {
3481 int data_end = leaf_data_end(root, leaf);
3482
3483 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3484 data_end + dsize,
3485 btrfs_leaf_data(leaf) + data_end,
3486 last_off - data_end);
3487
3488 for (i = slot + nr; i < nritems; i++) {
3489 u32 ioff;
3490
3491 item = btrfs_item_nr(leaf, i);
3492 if (!leaf->map_token) {
3493 map_extent_buffer(leaf, (unsigned long)item,
3494 sizeof(struct btrfs_item),
3495 &leaf->map_token, &leaf->kaddr,
3496 &leaf->map_start, &leaf->map_len,
3497 KM_USER1);
3498 }
3499 ioff = btrfs_item_offset(leaf, item);
3500 btrfs_set_item_offset(leaf, item, ioff + dsize);
3501 }
3502
3503 if (leaf->map_token) {
3504 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3505 leaf->map_token = NULL;
3506 }
3507
3508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509 btrfs_item_nr_offset(slot + nr),
3510 sizeof(struct btrfs_item) *
3511 (nritems - slot - nr));
3512 }
3513 btrfs_set_header_nritems(leaf, nritems - nr);
3514 nritems -= nr;
3515
3516 /* delete the leaf if we've emptied it */
3517 if (nritems == 0) {
3518 if (leaf == root->node) {
3519 btrfs_set_header_level(leaf, 0);
3520 } else {
3521 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3522 BUG_ON(ret);
3523 }
3524 } else {
3525 int used = leaf_space_used(leaf, 0, nritems);
3526 if (slot == 0) {
3527 struct btrfs_disk_key disk_key;
3528
3529 btrfs_item_key(leaf, &disk_key, 0);
3530 wret = fixup_low_keys(trans, root, path,
3531 &disk_key, 1);
3532 if (wret)
3533 ret = wret;
3534 }
3535
3536 /* delete the leaf if it is mostly empty */
3537 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 /* push_leaf_left fixes the path.
3539 * make sure the path still points to our leaf
3540 * for possible call to del_ptr below
3541 */
3542 slot = path->slots[1];
3543 extent_buffer_get(leaf);
3544
3545 wret = push_leaf_left(trans, root, path, 1, 1);
3546 if (wret < 0 && wret != -ENOSPC)
3547 ret = wret;
3548
3549 if (path->nodes[0] == leaf &&
3550 btrfs_header_nritems(leaf)) {
3551 wret = push_leaf_right(trans, root, path, 1, 1);
3552 if (wret < 0 && wret != -ENOSPC)
3553 ret = wret;
3554 }
3555
3556 if (btrfs_header_nritems(leaf) == 0) {
3557 path->slots[1] = slot;
3558 ret = btrfs_del_leaf(trans, root, path,
3559 leaf->start);
3560 BUG_ON(ret);
3561 free_extent_buffer(leaf);
3562 } else {
3563 /* if we're still in the path, make sure
3564 * we're dirty. Otherwise, one of the
3565 * push_leaf functions must have already
3566 * dirtied this buffer
3567 */
3568 if (path->nodes[0] == leaf)
3569 btrfs_mark_buffer_dirty(leaf);
3570 free_extent_buffer(leaf);
3571 }
3572 } else {
3573 btrfs_mark_buffer_dirty(leaf);
3574 }
3575 }
3576 return ret;
3577}
3578
3579/*
3580 * search the tree again to find a leaf with lesser keys
3581 * returns 0 if it found something or 1 if there are no lesser leaves.
3582 * returns < 0 on io errors.
3583 *
3584 * This may release the path, and so you may lose any locks held at the
3585 * time you call it.
3586 */
3587int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3588{
3589 struct btrfs_key key;
3590 struct btrfs_disk_key found_key;
3591 int ret;
3592
3593 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594
3595 if (key.offset > 0)
3596 key.offset--;
3597 else if (key.type > 0)
3598 key.type--;
3599 else if (key.objectid > 0)
3600 key.objectid--;
3601 else
3602 return 1;
3603
3604 btrfs_release_path(root, path);
3605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3606 if (ret < 0)
3607 return ret;
3608 btrfs_item_key(path->nodes[0], &found_key, 0);
3609 ret = comp_keys(&found_key, &key);
3610 if (ret < 0)
3611 return 0;
3612 return 1;
3613}
3614
3615/*
3616 * A helper function to walk down the tree starting at min_key, and looking
3617 * for nodes or leaves that are either in cache or have a minimum
3618 * transaction id. This is used by the btree defrag code, and tree logging
3619 *
3620 * This does not cow, but it does stuff the starting key it finds back
3621 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3622 * key and get a writable path.
3623 *
3624 * This does lock as it descends, and path->keep_locks should be set
3625 * to 1 by the caller.
3626 *
3627 * This honors path->lowest_level to prevent descent past a given level
3628 * of the tree.
3629 *
3630 * min_trans indicates the oldest transaction that you are interested
3631 * in walking through. Any nodes or leaves older than min_trans are
3632 * skipped over (without reading them).
3633 *
3634 * returns zero if something useful was found, < 0 on error and 1 if there
3635 * was nothing in the tree that matched the search criteria.
3636 */
3637int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638 struct btrfs_key *max_key,
3639 struct btrfs_path *path, int cache_only,
3640 u64 min_trans)
3641{
3642 struct extent_buffer *cur;
3643 struct btrfs_key found_key;
3644 int slot;
3645 int sret;
3646 u32 nritems;
3647 int level;
3648 int ret = 1;
3649
3650 WARN_ON(!path->keep_locks);
3651again:
3652 cur = btrfs_lock_root_node(root);
3653 level = btrfs_header_level(cur);
3654 WARN_ON(path->nodes[level]);
3655 path->nodes[level] = cur;
3656 path->locks[level] = 1;
3657
3658 if (btrfs_header_generation(cur) < min_trans) {
3659 ret = 1;
3660 goto out;
3661 }
3662 while (1) {
3663 nritems = btrfs_header_nritems(cur);
3664 level = btrfs_header_level(cur);
3665 sret = bin_search(cur, min_key, level, &slot);
3666
3667 /* at the lowest level, we're done, setup the path and exit */
3668 if (level == path->lowest_level) {
3669 if (slot >= nritems)
3670 goto find_next_key;
3671 ret = 0;
3672 path->slots[level] = slot;
3673 btrfs_item_key_to_cpu(cur, &found_key, slot);
3674 goto out;
3675 }
3676 if (sret && slot > 0)
3677 slot--;
3678 /*
3679 * check this node pointer against the cache_only and
3680 * min_trans parameters. If it isn't in cache or is too
3681 * old, skip to the next one.
3682 */
3683 while (slot < nritems) {
3684 u64 blockptr;
3685 u64 gen;
3686 struct extent_buffer *tmp;
3687 struct btrfs_disk_key disk_key;
3688
3689 blockptr = btrfs_node_blockptr(cur, slot);
3690 gen = btrfs_node_ptr_generation(cur, slot);
3691 if (gen < min_trans) {
3692 slot++;
3693 continue;
3694 }
3695 if (!cache_only)
3696 break;
3697
3698 if (max_key) {
3699 btrfs_node_key(cur, &disk_key, slot);
3700 if (comp_keys(&disk_key, max_key) >= 0) {
3701 ret = 1;
3702 goto out;
3703 }
3704 }
3705
3706 tmp = btrfs_find_tree_block(root, blockptr,
3707 btrfs_level_size(root, level - 1));
3708
3709 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3710 free_extent_buffer(tmp);
3711 break;
3712 }
3713 if (tmp)
3714 free_extent_buffer(tmp);
3715 slot++;
3716 }
3717find_next_key:
3718 /*
3719 * we didn't find a candidate key in this node, walk forward
3720 * and find another one
3721 */
3722 if (slot >= nritems) {
3723 path->slots[level] = slot;
3724 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans);
3726 if (sret == 0) {
3727 btrfs_release_path(root, path);
3728 goto again;
3729 } else {
3730 goto out;
3731 }
3732 }
3733 /* save our key for returning back */
3734 btrfs_node_key_to_cpu(cur, &found_key, slot);
3735 path->slots[level] = slot;
3736 if (level == path->lowest_level) {
3737 ret = 0;
3738 unlock_up(path, level, 1);
3739 goto out;
3740 }
3741 cur = read_node_slot(root, cur, slot);
3742
3743 btrfs_tree_lock(cur);
3744 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1);
3747 }
3748out:
3749 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key));
3751 return ret;
3752}
3753
3754/*
3755 * this is similar to btrfs_next_leaf, but does not try to preserve
3756 * and fixup the path. It looks for and returns the next key in the
3757 * tree based on the current path and the cache_only and min_trans
3758 * parameters.
3759 *
3760 * 0 is returned if another key is found, < 0 if there are any errors
3761 * and 1 is returned if there are no higher keys in the tree
3762 *
3763 * path->keep_locks should be set to 1 on the search made before
3764 * calling this function.
3765 */
3766int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 struct btrfs_key *key, int lowest_level,
3768 int cache_only, u64 min_trans)
3769{
3770 int level = lowest_level;
3771 int slot;
3772 struct extent_buffer *c;
3773
3774 WARN_ON(!path->keep_locks);
3775 while (level < BTRFS_MAX_LEVEL) {
3776 if (!path->nodes[level])
3777 return 1;
3778
3779 slot = path->slots[level] + 1;
3780 c = path->nodes[level];
3781next:
3782 if (slot >= btrfs_header_nritems(c)) {
3783 level++;
3784 if (level == BTRFS_MAX_LEVEL)
3785 return 1;
3786 continue;
3787 }
3788 if (level == 0)
3789 btrfs_item_key_to_cpu(c, key, slot);
3790 else {
3791 u64 blockptr = btrfs_node_blockptr(c, slot);
3792 u64 gen = btrfs_node_ptr_generation(c, slot);
3793
3794 if (cache_only) {
3795 struct extent_buffer *cur;
3796 cur = btrfs_find_tree_block(root, blockptr,
3797 btrfs_level_size(root, level - 1));
3798 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3799 slot++;
3800 if (cur)
3801 free_extent_buffer(cur);
3802 goto next;
3803 }
3804 free_extent_buffer(cur);
3805 }
3806 if (gen < min_trans) {
3807 slot++;
3808 goto next;
3809 }
3810 btrfs_node_key_to_cpu(c, key, slot);
3811 }
3812 return 0;
3813 }
3814 return 1;
3815}
3816
3817/*
3818 * search the tree again to find a leaf with greater keys
3819 * returns 0 if it found something or 1 if there are no greater leaves.
3820 * returns < 0 on io errors.
3821 */
3822int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823{
3824 int slot;
3825 int level = 1;
3826 struct extent_buffer *c;
3827 struct extent_buffer *next = NULL;
3828 struct btrfs_key key;
3829 u32 nritems;
3830 int ret;
3831
3832 nritems = btrfs_header_nritems(path->nodes[0]);
3833 if (nritems == 0)
3834 return 1;
3835
3836 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3837
3838 btrfs_release_path(root, path);
3839 path->keep_locks = 1;
3840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3841 path->keep_locks = 0;
3842
3843 if (ret < 0)
3844 return ret;
3845
3846 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /*
3848 * by releasing the path above we dropped all our locks. A balance
3849 * could have added more items next to the key that used to be
3850 * at the very end of the block. So, check again here and
3851 * advance the path if there are now more items available.
3852 */
3853 if (nritems > 0 && path->slots[0] < nritems - 1) {
3854 path->slots[0]++;
3855 goto done;
3856 }
3857
3858 while (level < BTRFS_MAX_LEVEL) {
3859 if (!path->nodes[level])
3860 return 1;
3861
3862 slot = path->slots[level] + 1;
3863 c = path->nodes[level];
3864 if (slot >= btrfs_header_nritems(c)) {
3865 level++;
3866 if (level == BTRFS_MAX_LEVEL)
3867 return 1;
3868 continue;
3869 }
3870
3871 if (next) {
3872 btrfs_tree_unlock(next);
3873 free_extent_buffer(next);
3874 }
3875
3876 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada)
3878 reada_for_search(root, path, level, slot, 0);
3879
3880 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next);
3884 }
3885 break;
3886 }
3887 path->slots[level] = slot;
3888 while (1) {
3889 level--;
3890 c = path->nodes[level];
3891 if (path->locks[level])
3892 btrfs_tree_unlock(c);
3893 free_extent_buffer(c);
3894 path->nodes[level] = next;
3895 path->slots[level] = 0;
3896 if (!path->skip_locking)
3897 path->locks[level] = 1;
3898 if (!level)
3899 break;
3900 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next);
3906 }
3907 }
3908done:
3909 unlock_up(path, 0, 1);
3910 return 0;
3911}
3912
3913/*
3914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3915 * searching until it gets past min_objectid or finds an item of 'type'
3916 *
3917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3918 */
3919int btrfs_previous_item(struct btrfs_root *root,
3920 struct btrfs_path *path, u64 min_objectid,
3921 int type)
3922{
3923 struct btrfs_key found_key;
3924 struct extent_buffer *leaf;
3925 u32 nritems;
3926 int ret;
3927
3928 while (1) {
3929 if (path->slots[0] == 0) {
3930 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0)
3932 return ret;
3933 } else {
3934 path->slots[0]--;
3935 }
3936 leaf = path->nodes[0];
3937 nritems = btrfs_header_nritems(leaf);
3938 if (nritems == 0)
3939 return 1;
3940 if (path->slots[0] == nritems)
3941 path->slots[0]--;
3942
3943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3944 if (found_key.type == type)
3945 return 0;
3946 if (found_key.objectid < min_objectid)
3947 break;
3948 if (found_key.objectid == min_objectid &&
3949 found_key.type < type)
3950 break;
3951 }
3952 return 1;
3953}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BHRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* holds checksums of all the data extents */
77#define BTRFS_CSUM_TREE_OBJECTID 7ULL
78
79/* orhpan objectid for tracking unlinked/truncated files */
80#define BTRFS_ORPHAN_OBJECTID -5ULL
81
82/* does write ahead logging to speed up fsyncs */
83#define BTRFS_TREE_LOG_OBJECTID -6ULL
84#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
85
86/* for space balancing */
87#define BTRFS_TREE_RELOC_OBJECTID -8ULL
88#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
89
90/*
91 * extent checksums all have this objectid
92 * this allows them to share the logging tree
93 * for fsyncs
94 */
95#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
96
97/* dummy objectid represents multiple objectids */
98#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
99
100/*
101 * All files have objectids in this range.
102 */
103#define BTRFS_FIRST_FREE_OBJECTID 256ULL
104#define BTRFS_LAST_FREE_OBJECTID -256ULL
105#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
106
107
108/*
109 * the device items go into the chunk tree. The key is in the form
110 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
111 */
112#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
113
114/*
115 * we can actually store much bigger names, but lets not confuse the rest
116 * of linux
117 */
118#define BTRFS_NAME_LEN 255
119
120/* 32 bytes in various csum fields */
121#define BTRFS_CSUM_SIZE 32
122
123/* csum types */
124#define BTRFS_CSUM_TYPE_CRC32 0
125
126static int btrfs_csum_sizes[] = { 4, 0 };
127
128/* four bytes for CRC32 */
129#define BTRFS_EMPTY_DIR_SIZE 0
130
131#define BTRFS_FT_UNKNOWN 0
132#define BTRFS_FT_REG_FILE 1
133#define BTRFS_FT_DIR 2
134#define BTRFS_FT_CHRDEV 3
135#define BTRFS_FT_BLKDEV 4
136#define BTRFS_FT_FIFO 5
137#define BTRFS_FT_SOCK 6
138#define BTRFS_FT_SYMLINK 7
139#define BTRFS_FT_XATTR 8
140#define BTRFS_FT_MAX 9
141
142/*
143 * the key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresonds to the inode number. The flags
145 * tells us things about the object, and is a kind of stream selector.
146 * so for a given inode, keys with flags of 1 might refer to the inode
147 * data, flags of 2 may point to file data in the btree and flags == 3
148 * may point to extents.
149 *
150 * offset is the starting byte offset for this key in the stream.
151 *
152 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
153 * in cpu native order. Otherwise they are identical and their sizes
154 * should be the same (ie both packed)
155 */
156struct btrfs_disk_key {
157 __le64 objectid;
158 u8 type;
159 __le64 offset;
160} __attribute__ ((__packed__));
161
162struct btrfs_key {
163 u64 objectid;
164 u8 type;
165 u64 offset;
166} __attribute__ ((__packed__));
167
168struct btrfs_mapping_tree {
169 struct extent_map_tree map_tree;
170};
171
172#define BTRFS_UUID_SIZE 16
173struct btrfs_dev_item {
174 /* the internal btrfs device id */
175 __le64 devid;
176
177 /* size of the device */
178 __le64 total_bytes;
179
180 /* bytes used */
181 __le64 bytes_used;
182
183 /* optimal io alignment for this device */
184 __le32 io_align;
185
186 /* optimal io width for this device */
187 __le32 io_width;
188
189 /* minimal io size for this device */
190 __le32 sector_size;
191
192 /* type and info about this device */
193 __le64 type;
194
195 /* expected generation for this device */
196 __le64 generation;
197
198 /*
199 * starting byte of this partition on the device,
200 * to allowr for stripe alignment in the future
201 */
202 __le64 start_offset;
203
204 /* grouping information for allocation decisions */
205 __le32 dev_group;
206
207 /* seek speed 0-100 where 100 is fastest */
208 u8 seek_speed;
209
210 /* bandwidth 0-100 where 100 is fastest */
211 u8 bandwidth;
212
213 /* btrfs generated uuid for this device */
214 u8 uuid[BTRFS_UUID_SIZE];
215
216 /* uuid of FS who owns this device */
217 u8 fsid[BTRFS_UUID_SIZE];
218} __attribute__ ((__packed__));
219
220struct btrfs_stripe {
221 __le64 devid;
222 __le64 offset;
223 u8 dev_uuid[BTRFS_UUID_SIZE];
224} __attribute__ ((__packed__));
225
226struct btrfs_chunk {
227 /* size of this chunk in bytes */
228 __le64 length;
229
230 /* objectid of the root referencing this chunk */
231 __le64 owner;
232
233 __le64 stripe_len;
234 __le64 type;
235
236 /* optimal io alignment for this chunk */
237 __le32 io_align;
238
239 /* optimal io width for this chunk */
240 __le32 io_width;
241
242 /* minimal io size for this chunk */
243 __le32 sector_size;
244
245 /* 2^16 stripes is quite a lot, a second limit is the size of a single
246 * item in the btree
247 */
248 __le16 num_stripes;
249
250 /* sub stripes only matter for raid10 */
251 __le16 sub_stripes;
252 struct btrfs_stripe stripe;
253 /* additional stripes go here */
254} __attribute__ ((__packed__));
255
256static inline unsigned long btrfs_chunk_item_size(int num_stripes)
257{
258 BUG_ON(num_stripes == 0);
259 return sizeof(struct btrfs_chunk) +
260 sizeof(struct btrfs_stripe) * (num_stripes - 1);
261}
262
263#define BTRFS_FSID_SIZE 16
264#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
265
266/*
267 * every tree block (leaf or node) starts with this header.
268 */
269struct btrfs_header {
270 /* these first four must match the super block */
271 u8 csum[BTRFS_CSUM_SIZE];
272 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
273 __le64 bytenr; /* which block this node is supposed to live in */
274 __le64 flags;
275
276 /* allowed to be different from the super from here on down */
277 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
278 __le64 generation;
279 __le64 owner;
280 __le32 nritems;
281 u8 level;
282} __attribute__ ((__packed__));
283
284#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
285 sizeof(struct btrfs_header)) / \
286 sizeof(struct btrfs_key_ptr))
287#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
288#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
289#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
290 sizeof(struct btrfs_item) - \
291 sizeof(struct btrfs_file_extent_item))
292
293#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
294
295/*
296 * this is a very generous portion of the super block, giving us
297 * room to translate 14 chunks with 3 stripes each.
298 */
299#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
300#define BTRFS_LABEL_SIZE 256
301
302/*
303 * the super block basically lists the main trees of the FS
304 * it currently lacks any block count etc etc
305 */
306struct btrfs_super_block {
307 u8 csum[BTRFS_CSUM_SIZE];
308 /* the first 4 fields must match struct btrfs_header */
309 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
310 __le64 bytenr; /* this block number */
311 __le64 flags;
312
313 /* allowed to be different from the btrfs_header from here own down */
314 __le64 magic;
315 __le64 generation;
316 __le64 root;
317 __le64 chunk_root;
318 __le64 log_root;
319
320 /* this will help find the new super based on the log root */
321 __le64 log_root_transid;
322 __le64 total_bytes;
323 __le64 bytes_used;
324 __le64 root_dir_objectid;
325 __le64 num_devices;
326 __le32 sectorsize;
327 __le32 nodesize;
328 __le32 leafsize;
329 __le32 stripesize;
330 __le32 sys_chunk_array_size;
331 __le64 chunk_root_generation;
332 __le64 compat_flags;
333 __le64 compat_ro_flags;
334 __le64 incompat_flags;
335 __le16 csum_type;
336 u8 root_level;
337 u8 chunk_root_level;
338 u8 log_root_level;
339 struct btrfs_dev_item dev_item;
340
341 char label[BTRFS_LABEL_SIZE];
342
343 /* future expansion */
344 __le64 reserved[32];
345 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
346} __attribute__ ((__packed__));
347
348/*
349 * Compat flags that we support. If any incompat flags are set other than the
350 * ones specified below then we will fail to mount
351 */
352#define BTRFS_FEATURE_COMPAT_SUPP 0x0
353#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0
354#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0
355
356/*
357 * A leaf is full of items. offset and size tell us where to find
358 * the item in the leaf (relative to the start of the data area)
359 */
360struct btrfs_item {
361 struct btrfs_disk_key key;
362 __le32 offset;
363 __le32 size;
364} __attribute__ ((__packed__));
365
366/*
367 * leaves have an item area and a data area:
368 * [item0, item1....itemN] [free space] [dataN...data1, data0]
369 *
370 * The data is separate from the items to get the keys closer together
371 * during searches.
372 */
373struct btrfs_leaf {
374 struct btrfs_header header;
375 struct btrfs_item items[];
376} __attribute__ ((__packed__));
377
378/*
379 * all non-leaf blocks are nodes, they hold only keys and pointers to
380 * other blocks
381 */
382struct btrfs_key_ptr {
383 struct btrfs_disk_key key;
384 __le64 blockptr;
385 __le64 generation;
386} __attribute__ ((__packed__));
387
388struct btrfs_node {
389 struct btrfs_header header;
390 struct btrfs_key_ptr ptrs[];
391} __attribute__ ((__packed__));
392
393/*
394 * btrfs_paths remember the path taken from the root down to the leaf.
395 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
396 * to any other levels that are present.
397 *
398 * The slots array records the index of the item or block pointer
399 * used while walking the tree.
400 */
401struct btrfs_path {
402 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
403 int slots[BTRFS_MAX_LEVEL];
404 /* if there is real range locking, this locks field will change */
405 int locks[BTRFS_MAX_LEVEL];
406 int reada;
407 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level;
411
412 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes
415 */
416 int search_for_split;
417};
418
419/*
420 * items in the extent btree are used to record the objectid of the
421 * owner of the block and the number of references
422 */
423struct btrfs_extent_item {
424 __le32 refs;
425} __attribute__ ((__packed__));
426
427struct btrfs_extent_ref {
428 __le64 root;
429 __le64 generation;
430 __le64 objectid;
431 __le32 num_refs;
432} __attribute__ ((__packed__));
433
434/* dev extents record free space on individual devices. The owner
435 * field points back to the chunk allocation mapping tree that allocated
436 * the extent. The chunk tree uuid field is a way to double check the owner
437 */
438struct btrfs_dev_extent {
439 __le64 chunk_tree;
440 __le64 chunk_objectid;
441 __le64 chunk_offset;
442 __le64 length;
443 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
444} __attribute__ ((__packed__));
445
446struct btrfs_inode_ref {
447 __le64 index;
448 __le16 name_len;
449 /* name goes here */
450} __attribute__ ((__packed__));
451
452struct btrfs_timespec {
453 __le64 sec;
454 __le32 nsec;
455} __attribute__ ((__packed__));
456
457typedef enum {
458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type;
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468
469struct btrfs_inode_item {
470 /* nfs style generation number */
471 __le64 generation;
472 /* transid that last touched this inode */
473 __le64 transid;
474 __le64 size;
475 __le64 nbytes;
476 __le64 block_group;
477 __le32 nlink;
478 __le32 uid;
479 __le32 gid;
480 __le32 mode;
481 __le64 rdev;
482 __le64 flags;
483
484 /* modification sequence number for NFS */
485 __le64 sequence;
486
487 /*
488 * a little future expansion, for more than this we can
489 * just grow the inode item and version it
490 */
491 __le64 reserved[4];
492 struct btrfs_timespec atime;
493 struct btrfs_timespec ctime;
494 struct btrfs_timespec mtime;
495 struct btrfs_timespec otime;
496} __attribute__ ((__packed__));
497
498struct btrfs_dir_log_item {
499 __le64 end;
500} __attribute__ ((__packed__));
501
502struct btrfs_dir_item {
503 struct btrfs_disk_key location;
504 __le64 transid;
505 __le16 data_len;
506 __le16 name_len;
507 u8 type;
508} __attribute__ ((__packed__));
509
510struct btrfs_root_item {
511 struct btrfs_inode_item inode;
512 __le64 generation;
513 __le64 root_dirid;
514 __le64 bytenr;
515 __le64 byte_limit;
516 __le64 bytes_used;
517 __le64 last_snapshot;
518 __le64 flags;
519 __le32 refs;
520 struct btrfs_disk_key drop_progress;
521 u8 drop_level;
522 u8 level;
523} __attribute__ ((__packed__));
524
525/*
526 * this is used for both forward and backward root refs
527 */
528struct btrfs_root_ref {
529 __le64 dirid;
530 __le64 sequence;
531 __le16 name_len;
532} __attribute__ ((__packed__));
533
534#define BTRFS_FILE_EXTENT_INLINE 0
535#define BTRFS_FILE_EXTENT_REG 1
536#define BTRFS_FILE_EXTENT_PREALLOC 2
537
538struct btrfs_file_extent_item {
539 /*
540 * transaction id that created this extent
541 */
542 __le64 generation;
543 /*
544 * max number of bytes to hold this extent in ram
545 * when we split a compressed extent we can't know how big
546 * each of the resulting pieces will be. So, this is
547 * an upper limit on the size of the extent in ram instead of
548 * an exact limit.
549 */
550 __le64 ram_bytes;
551
552 /*
553 * 32 bits for the various ways we might encode the data,
554 * including compression and encryption. If any of these
555 * are set to something a given disk format doesn't understand
556 * it is treated like an incompat flag for reading and writing,
557 * but not for stat.
558 */
559 u8 compression;
560 u8 encryption;
561 __le16 other_encoding; /* spare for later use */
562
563 /* are we inline data or a real extent? */
564 u8 type;
565
566 /*
567 * disk space consumed by the extent, checksum blocks are included
568 * in these numbers
569 */
570 __le64 disk_bytenr;
571 __le64 disk_num_bytes;
572 /*
573 * the logical offset in file blocks (no csums)
574 * this extent record is for. This allows a file extent to point
575 * into the middle of an existing extent on disk, sharing it
576 * between two snapshots (useful if some bytes in the middle of the
577 * extent have changed
578 */
579 __le64 offset;
580 /*
581 * the logical number of file blocks (no csums included). This
582 * always reflects the size uncompressed and without encoding.
583 */
584 __le64 num_bytes;
585
586} __attribute__ ((__packed__));
587
588struct btrfs_csum_item {
589 u8 csum;
590} __attribute__ ((__packed__));
591
592/* different types of block groups (and chunks) */
593#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
594#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
595#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
596#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
597#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
598#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
599#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
600
601struct btrfs_block_group_item {
602 __le64 used;
603 __le64 chunk_objectid;
604 __le64 flags;
605} __attribute__ ((__packed__));
606
607struct btrfs_space_info {
608 u64 flags;
609 u64 total_bytes;
610 u64 bytes_used;
611 u64 bytes_pinned;
612 u64 bytes_reserved;
613 u64 bytes_readonly;
614 int full;
615 int force_alloc;
616 struct list_head list;
617
618 /* for block groups in our same type */
619 struct list_head block_groups;
620 spinlock_t lock;
621 struct rw_semaphore groups_sem;
622};
623
624struct btrfs_free_space {
625 struct rb_node bytes_index;
626 struct rb_node offset_index;
627 u64 offset;
628 u64 bytes;
629};
630
631struct btrfs_block_group_cache {
632 struct btrfs_key key;
633 struct btrfs_block_group_item item;
634 spinlock_t lock;
635 struct mutex alloc_mutex;
636 struct mutex cache_mutex;
637 u64 pinned;
638 u64 reserved;
639 u64 flags;
640 int cached;
641 int ro;
642 int dirty;
643
644 struct btrfs_space_info *space_info;
645
646 /* free space cache stuff */
647 struct rb_root free_space_bytes;
648 struct rb_root free_space_offset;
649
650 /* block group cache stuff */
651 struct rb_node cache_node;
652
653 /* for block groups in the same raid type */
654 struct list_head list;
655
656 /* usage count */
657 atomic_t count;
658};
659
660struct btrfs_leaf_ref_tree {
661 struct rb_root root;
662 struct list_head list;
663 spinlock_t lock;
664};
665
666struct btrfs_device;
667struct btrfs_fs_devices;
668struct btrfs_fs_info {
669 u8 fsid[BTRFS_FSID_SIZE];
670 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
671 struct btrfs_root *extent_root;
672 struct btrfs_root *tree_root;
673 struct btrfs_root *chunk_root;
674 struct btrfs_root *dev_root;
675 struct btrfs_root *fs_root;
676 struct btrfs_root *csum_root;
677
678 /* the log root tree is a directory of all the other log roots */
679 struct btrfs_root *log_root_tree;
680 struct radix_tree_root fs_roots_radix;
681
682 /* block group cache stuff */
683 spinlock_t block_group_cache_lock;
684 struct rb_root block_group_cache_tree;
685
686 struct extent_io_tree pinned_extents;
687 struct extent_io_tree pending_del;
688 struct extent_io_tree extent_ins;
689
690 /* logical->physical extent mapping */
691 struct btrfs_mapping_tree mapping_tree;
692
693 u64 generation;
694 u64 last_trans_committed;
695 u64 last_trans_new_blockgroup;
696 u64 open_ioctl_trans;
697 unsigned long mount_opt;
698 u64 max_extent;
699 u64 max_inline;
700 u64 alloc_start;
701 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707
708 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit;
710 struct block_device *__bdev;
711 struct super_block *sb;
712 struct inode *btree_inode;
713 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex;
716 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex;
718 struct mutex cleaner_mutex;
719 struct mutex extent_ins_mutex;
720 struct mutex pinned_mutex;
721 struct mutex chunk_mutex;
722 struct mutex drop_mutex;
723 struct mutex volume_mutex;
724 struct mutex tree_reloc_mutex;
725 struct list_head trans_list;
726 struct list_head hashers;
727 struct list_head dead_roots;
728
729 atomic_t nr_async_submits;
730 atomic_t async_submit_draining;
731 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737
738 /*
739 * this is used by the balancing code to wait for all the pending
740 * ordered extents
741 */
742 spinlock_t ordered_extent_lock;
743 struct list_head ordered_extents;
744 struct list_head delalloc_inodes;
745
746 /*
747 * there is a pool of worker threads for checksumming during writes
748 * and a pool for checksumming after reads. This is because readers
749 * can run with FS locks held, and the writers may be waiting for
750 * those locks. We don't want ordering in the pending list to cause
751 * deadlocks, and so the two are serviced separately.
752 *
753 * A third pool does submit_bio to avoid deadlocking with the other
754 * two
755 */
756 struct btrfs_workers workers;
757 struct btrfs_workers delalloc_workers;
758 struct btrfs_workers endio_workers;
759 struct btrfs_workers endio_meta_workers;
760 struct btrfs_workers endio_meta_write_workers;
761 struct btrfs_workers endio_write_workers;
762 struct btrfs_workers submit_workers;
763 /*
764 * fixup workers take dirty pages that didn't properly go through
765 * the cow mechanism and make them safe to write. It happens
766 * for the sys_munmap function call path
767 */
768 struct btrfs_workers fixup_workers;
769 struct task_struct *transaction_kthread;
770 struct task_struct *cleaner_kthread;
771 int thread_pool_size;
772
773 /* tree relocation relocated fields */
774 struct list_head dead_reloc_roots;
775 struct btrfs_leaf_ref_tree reloc_ref_tree;
776 struct btrfs_leaf_ref_tree shared_ref_tree;
777
778 struct kobject super_kobj;
779 struct completion kobj_unregister;
780 int do_barriers;
781 int closing;
782 int log_root_recovering;
783 atomic_t throttles;
784 atomic_t throttle_gen;
785
786 u64 total_pinned;
787 struct list_head dirty_cowonly_roots;
788
789 struct btrfs_fs_devices *fs_devices;
790 struct list_head space_info;
791 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock;
793 u64 delalloc_bytes;
794 u64 last_alloc;
795 u64 last_data_alloc;
796
797 spinlock_t ref_cache_lock;
798 u64 total_ref_cache_size;
799
800 u64 avail_data_alloc_bits;
801 u64 avail_metadata_alloc_bits;
802 u64 avail_system_alloc_bits;
803 u64 data_alloc_profile;
804 u64 metadata_alloc_profile;
805 u64 system_alloc_profile;
806
807 void *bdev_holder;
808};
809
810/*
811 * in ram representation of the tree. extent_root is used for all allocations
812 * and for the extent tree extent_root root.
813 */
814struct btrfs_dirty_root;
815struct btrfs_root {
816 struct extent_buffer *node;
817
818 /* the node lock is held while changing the node pointer */
819 spinlock_t node_lock;
820
821 struct extent_buffer *commit_root;
822 struct btrfs_leaf_ref_tree *ref_tree;
823 struct btrfs_leaf_ref_tree ref_tree_struct;
824 struct btrfs_dirty_root *dirty_root;
825 struct btrfs_root *log_root;
826 struct btrfs_root *reloc_root;
827
828 struct btrfs_root_item root_item;
829 struct btrfs_key root_key;
830 struct btrfs_fs_info *fs_info;
831 struct extent_io_tree dirty_log_pages;
832
833 struct kobject root_kobj;
834 struct completion kobj_unregister;
835 struct mutex objectid_mutex;
836 struct mutex log_mutex;
837
838 u64 objectid;
839 u64 last_trans;
840
841 /* data allocations are done in sectorsize units */
842 u32 sectorsize;
843
844 /* node allocations are done in nodesize units */
845 u32 nodesize;
846
847 /* leaf allocations are done in leafsize units */
848 u32 leafsize;
849
850 u32 stripesize;
851
852 u32 type;
853 u64 highest_inode;
854 u64 last_inode_alloc;
855 int ref_cows;
856 int track_dirty;
857 u64 defrag_trans_start;
858 struct btrfs_key defrag_progress;
859 struct btrfs_key defrag_max;
860 int defrag_running;
861 int defrag_level;
862 char *name;
863 int in_sysfs;
864
865 /* the dirty list is only used by non-reference counted roots */
866 struct list_head dirty_list;
867
868 spinlock_t list_lock;
869 struct list_head dead_list;
870 struct list_head orphan_list;
871
872 /*
873 * right now this just gets used so that a root has its own devid
874 * for stat. It may be used for more later
875 */
876 struct super_block anon_super;
877};
878
879/*
880
881 * inode items have the data typically returned from stat and store other
882 * info about object characteristics. There is one for every file and dir in
883 * the FS
884 */
885#define BTRFS_INODE_ITEM_KEY 1
886#define BTRFS_INODE_REF_KEY 12
887#define BTRFS_XATTR_ITEM_KEY 24
888#define BTRFS_ORPHAN_ITEM_KEY 48
889/* reserve 2-15 close to the inode for later flexibility */
890
891/*
892 * dir items are the name -> inode pointers in a directory. There is one
893 * for every name in a directory.
894 */
895#define BTRFS_DIR_LOG_ITEM_KEY 60
896#define BTRFS_DIR_LOG_INDEX_KEY 72
897#define BTRFS_DIR_ITEM_KEY 84
898#define BTRFS_DIR_INDEX_KEY 96
899/*
900 * extent data is for file data
901 */
902#define BTRFS_EXTENT_DATA_KEY 108
903
904/*
905 * extent csums are stored in a separate tree and hold csums for
906 * an entire extent on disk.
907 */
908#define BTRFS_EXTENT_CSUM_KEY 128
909
910/*
911 * root items point to tree roots. There are typically in the root
912 * tree used by the super block to find all the other trees
913 */
914#define BTRFS_ROOT_ITEM_KEY 132
915
916/*
917 * root backrefs tie subvols and snapshots to the directory entries that
918 * reference them
919 */
920#define BTRFS_ROOT_BACKREF_KEY 144
921
922/*
923 * root refs make a fast index for listing all of the snapshots and
924 * subvolumes referenced by a given root. They point directly to the
925 * directory item in the root that references the subvol
926 */
927#define BTRFS_ROOT_REF_KEY 156
928
929/*
930 * extent items are in the extent map tree. These record which blocks
931 * are used, and how many references there are to each block
932 */
933#define BTRFS_EXTENT_ITEM_KEY 168
934#define BTRFS_EXTENT_REF_KEY 180
935
936/*
937 * block groups give us hints into the extent allocation trees. Which
938 * blocks are free etc etc
939 */
940#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
941
942#define BTRFS_DEV_EXTENT_KEY 204
943#define BTRFS_DEV_ITEM_KEY 216
944#define BTRFS_CHUNK_ITEM_KEY 228
945
946/*
947 * string items are for debugging. They just store a short string of
948 * data in the FS
949 */
950#define BTRFS_STRING_ITEM_KEY 253
951
952#define BTRFS_MOUNT_NODATASUM (1 << 0)
953#define BTRFS_MOUNT_NODATACOW (1 << 1)
954#define BTRFS_MOUNT_NOBARRIER (1 << 2)
955#define BTRFS_MOUNT_SSD (1 << 3)
956#define BTRFS_MOUNT_DEGRADED (1 << 4)
957#define BTRFS_MOUNT_COMPRESS (1 << 5)
958
959#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
960#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
961#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
962 BTRFS_MOUNT_##opt)
963/*
964 * Inode flags
965 */
966#define BTRFS_INODE_NODATASUM (1 << 0)
967#define BTRFS_INODE_NODATACOW (1 << 1)
968#define BTRFS_INODE_READONLY (1 << 2)
969#define BTRFS_INODE_NOCOMPRESS (1 << 3)
970#define BTRFS_INODE_PREALLOC (1 << 4)
971#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
972 ~BTRFS_INODE_##flag)
973#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
974 BTRFS_INODE_##flag)
975#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
976 BTRFS_INODE_##flag)
977/* some macros to generate set/get funcs for the struct fields. This
978 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
979 * one for u8:
980 */
981#define le8_to_cpu(v) (v)
982#define cpu_to_le8(v) (v)
983#define __le8 u8
984
985#define read_eb_member(eb, ptr, type, member, result) ( \
986 read_extent_buffer(eb, (char *)(result), \
987 ((unsigned long)(ptr)) + \
988 offsetof(type, member), \
989 sizeof(((type *)0)->member)))
990
991#define write_eb_member(eb, ptr, type, member, result) ( \
992 write_extent_buffer(eb, (char *)(result), \
993 ((unsigned long)(ptr)) + \
994 offsetof(type, member), \
995 sizeof(((type *)0)->member)))
996
997#ifndef BTRFS_SETGET_FUNCS
998#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
999u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
1000void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1001#endif
1002
1003#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1004static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1005{ \
1006 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1007 u##bits res = le##bits##_to_cpu(p->member); \
1008 kunmap_atomic(p, KM_USER0); \
1009 return res; \
1010} \
1011static inline void btrfs_set_##name(struct extent_buffer *eb, \
1012 u##bits val) \
1013{ \
1014 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1015 p->member = cpu_to_le##bits(val); \
1016 kunmap_atomic(p, KM_USER0); \
1017}
1018
1019#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
1020static inline u##bits btrfs_##name(type *s) \
1021{ \
1022 return le##bits##_to_cpu(s->member); \
1023} \
1024static inline void btrfs_set_##name(type *s, u##bits val) \
1025{ \
1026 s->member = cpu_to_le##bits(val); \
1027}
1028
1029BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
1030BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
1031BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
1032BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
1033BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
1034BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
1035 start_offset, 64);
1036BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
1037BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
1038BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
1039BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
1040BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
1041BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
1042
1043BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
1044BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
1045 total_bytes, 64);
1046BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
1047 bytes_used, 64);
1048BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
1049 io_align, 32);
1050BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
1051 io_width, 32);
1052BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
1053 sector_size, 32);
1054BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
1055BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
1056 dev_group, 32);
1057BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
1058 seek_speed, 8);
1059BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
1060 bandwidth, 8);
1061BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1062 generation, 64);
1063
1064static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1065{
1066 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1067}
1068
1069static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1070{
1071 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1072}
1073
1074BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1075BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1076BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1077BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1078BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1079BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1080BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1081BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1082BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1083BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1084BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1085
1086static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1087{
1088 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1089}
1090
1091BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1092BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1093BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1094 stripe_len, 64);
1095BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1096 io_align, 32);
1097BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1098 io_width, 32);
1099BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1100 sector_size, 32);
1101BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1102BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1103 num_stripes, 16);
1104BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1105 sub_stripes, 16);
1106BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1107BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1108
1109static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1110 int nr)
1111{
1112 unsigned long offset = (unsigned long)c;
1113 offset += offsetof(struct btrfs_chunk, stripe);
1114 offset += nr * sizeof(struct btrfs_stripe);
1115 return (struct btrfs_stripe *)offset;
1116}
1117
1118static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
1119{
1120 return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
1121}
1122
1123static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1124 struct btrfs_chunk *c, int nr)
1125{
1126 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1127}
1128
1129static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1130 struct btrfs_chunk *c, int nr,
1131 u64 val)
1132{
1133 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1134}
1135
1136static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1137 struct btrfs_chunk *c, int nr)
1138{
1139 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1140}
1141
1142static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1143 struct btrfs_chunk *c, int nr,
1144 u64 val)
1145{
1146 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1147}
1148
1149/* struct btrfs_block_group_item */
1150BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1151 used, 64);
1152BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1153 used, 64);
1154BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1155 struct btrfs_block_group_item, chunk_objectid, 64);
1156
1157BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1158 struct btrfs_block_group_item, chunk_objectid, 64);
1159BTRFS_SETGET_FUNCS(disk_block_group_flags,
1160 struct btrfs_block_group_item, flags, 64);
1161BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1162 struct btrfs_block_group_item, flags, 64);
1163
1164/* struct btrfs_inode_ref */
1165BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1166BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1167
1168/* struct btrfs_inode_item */
1169BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1170BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
1171BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1172BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1173BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1174BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1175BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1176BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1177BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1178BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1179BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1180BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
1181
1182static inline struct btrfs_timespec *
1183btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1184{
1185 unsigned long ptr = (unsigned long)inode_item;
1186 ptr += offsetof(struct btrfs_inode_item, atime);
1187 return (struct btrfs_timespec *)ptr;
1188}
1189
1190static inline struct btrfs_timespec *
1191btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1192{
1193 unsigned long ptr = (unsigned long)inode_item;
1194 ptr += offsetof(struct btrfs_inode_item, mtime);
1195 return (struct btrfs_timespec *)ptr;
1196}
1197
1198static inline struct btrfs_timespec *
1199btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1200{
1201 unsigned long ptr = (unsigned long)inode_item;
1202 ptr += offsetof(struct btrfs_inode_item, ctime);
1203 return (struct btrfs_timespec *)ptr;
1204}
1205
1206static inline struct btrfs_timespec *
1207btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1208{
1209 unsigned long ptr = (unsigned long)inode_item;
1210 ptr += offsetof(struct btrfs_inode_item, otime);
1211 return (struct btrfs_timespec *)ptr;
1212}
1213
1214BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1215BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1216
1217/* struct btrfs_dev_extent */
1218BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1219 chunk_tree, 64);
1220BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1221 chunk_objectid, 64);
1222BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1223 chunk_offset, 64);
1224BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1225
1226static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1227{
1228 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1229 return (u8 *)((unsigned long)dev + ptr);
1230}
1231
1232/* struct btrfs_extent_ref */
1233BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1234BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1235BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1236BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1237
1238BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1239BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1240 generation, 64);
1241BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1242 objectid, 64);
1243BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1244 num_refs, 32);
1245
1246/* struct btrfs_extent_item */
1247BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1248BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1249 refs, 32);
1250
1251/* struct btrfs_node */
1252BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1253BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1254
1255static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1256{
1257 unsigned long ptr;
1258 ptr = offsetof(struct btrfs_node, ptrs) +
1259 sizeof(struct btrfs_key_ptr) * nr;
1260 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1261}
1262
1263static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1264 int nr, u64 val)
1265{
1266 unsigned long ptr;
1267 ptr = offsetof(struct btrfs_node, ptrs) +
1268 sizeof(struct btrfs_key_ptr) * nr;
1269 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1270}
1271
1272static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1273{
1274 unsigned long ptr;
1275 ptr = offsetof(struct btrfs_node, ptrs) +
1276 sizeof(struct btrfs_key_ptr) * nr;
1277 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1278}
1279
1280static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1281 int nr, u64 val)
1282{
1283 unsigned long ptr;
1284 ptr = offsetof(struct btrfs_node, ptrs) +
1285 sizeof(struct btrfs_key_ptr) * nr;
1286 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1287}
1288
1289static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1290{
1291 return offsetof(struct btrfs_node, ptrs) +
1292 sizeof(struct btrfs_key_ptr) * nr;
1293}
1294
1295void btrfs_node_key(struct extent_buffer *eb,
1296 struct btrfs_disk_key *disk_key, int nr);
1297
1298static inline void btrfs_set_node_key(struct extent_buffer *eb,
1299 struct btrfs_disk_key *disk_key, int nr)
1300{
1301 unsigned long ptr;
1302 ptr = btrfs_node_key_ptr_offset(nr);
1303 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1304 struct btrfs_key_ptr, key, disk_key);
1305}
1306
1307/* struct btrfs_item */
1308BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1309BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1310
1311static inline unsigned long btrfs_item_nr_offset(int nr)
1312{
1313 return offsetof(struct btrfs_leaf, items) +
1314 sizeof(struct btrfs_item) * nr;
1315}
1316
1317static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
1318 int nr)
1319{
1320 return (struct btrfs_item *)btrfs_item_nr_offset(nr);
1321}
1322
1323static inline u32 btrfs_item_end(struct extent_buffer *eb,
1324 struct btrfs_item *item)
1325{
1326 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1327}
1328
1329static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1330{
1331 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1332}
1333
1334static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1335{
1336 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1337}
1338
1339static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1340{
1341 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1342}
1343
1344static inline void btrfs_item_key(struct extent_buffer *eb,
1345 struct btrfs_disk_key *disk_key, int nr)
1346{
1347 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1348 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1349}
1350
1351static inline void btrfs_set_item_key(struct extent_buffer *eb,
1352 struct btrfs_disk_key *disk_key, int nr)
1353{
1354 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1355 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1356}
1357
1358BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1359
1360/*
1361 * struct btrfs_root_ref
1362 */
1363BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1364BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1365BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1366
1367/* struct btrfs_dir_item */
1368BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1369BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1370BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1371BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1372
1373static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1374 struct btrfs_dir_item *item,
1375 struct btrfs_disk_key *key)
1376{
1377 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1378}
1379
1380static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1381 struct btrfs_dir_item *item,
1382 struct btrfs_disk_key *key)
1383{
1384 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1385}
1386
1387/* struct btrfs_disk_key */
1388BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1389 objectid, 64);
1390BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1391BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1392
1393static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1394 struct btrfs_disk_key *disk)
1395{
1396 cpu->offset = le64_to_cpu(disk->offset);
1397 cpu->type = disk->type;
1398 cpu->objectid = le64_to_cpu(disk->objectid);
1399}
1400
1401static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1402 struct btrfs_key *cpu)
1403{
1404 disk->offset = cpu_to_le64(cpu->offset);
1405 disk->type = cpu->type;
1406 disk->objectid = cpu_to_le64(cpu->objectid);
1407}
1408
1409static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1410 struct btrfs_key *key, int nr)
1411{
1412 struct btrfs_disk_key disk_key;
1413 btrfs_node_key(eb, &disk_key, nr);
1414 btrfs_disk_key_to_cpu(key, &disk_key);
1415}
1416
1417static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1418 struct btrfs_key *key, int nr)
1419{
1420 struct btrfs_disk_key disk_key;
1421 btrfs_item_key(eb, &disk_key, nr);
1422 btrfs_disk_key_to_cpu(key, &disk_key);
1423}
1424
1425static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1426 struct btrfs_dir_item *item,
1427 struct btrfs_key *key)
1428{
1429 struct btrfs_disk_key disk_key;
1430 btrfs_dir_item_key(eb, item, &disk_key);
1431 btrfs_disk_key_to_cpu(key, &disk_key);
1432}
1433
1434
1435static inline u8 btrfs_key_type(struct btrfs_key *key)
1436{
1437 return key->type;
1438}
1439
1440static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1441{
1442 key->type = val;
1443}
1444
1445/* struct btrfs_header */
1446BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1447BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1448 generation, 64);
1449BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1450BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1451BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1452BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1453
1454static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1455{
1456 return (btrfs_header_flags(eb) & flag) == flag;
1457}
1458
1459static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1460{
1461 u64 flags = btrfs_header_flags(eb);
1462 btrfs_set_header_flags(eb, flags | flag);
1463 return (flags & flag) == flag;
1464}
1465
1466static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1467{
1468 u64 flags = btrfs_header_flags(eb);
1469 btrfs_set_header_flags(eb, flags & ~flag);
1470 return (flags & flag) == flag;
1471}
1472
1473static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1474{
1475 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1476 return (u8 *)ptr;
1477}
1478
1479static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1480{
1481 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1482 return (u8 *)ptr;
1483}
1484
1485static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1486{
1487 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1488 return (u8 *)ptr;
1489}
1490
1491static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1492{
1493 unsigned long ptr = offsetof(struct btrfs_header, csum);
1494 return (u8 *)ptr;
1495}
1496
1497static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1498{
1499 return NULL;
1500}
1501
1502static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1503{
1504 return NULL;
1505}
1506
1507static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1508{
1509 return NULL;
1510}
1511
1512static inline int btrfs_is_leaf(struct extent_buffer *eb)
1513{
1514 return btrfs_header_level(eb) == 0;
1515}
1516
1517/* struct btrfs_root_item */
1518BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1519 generation, 64);
1520BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1521BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1522BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1523
1524BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1525 generation, 64);
1526BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1527BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1528BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1529BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1530BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
1531BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1532BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1533BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1534 last_snapshot, 64);
1535
1536/* struct btrfs_super_block */
1537
1538BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1539BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1540BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1541 generation, 64);
1542BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1543BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1544 struct btrfs_super_block, sys_chunk_array_size, 32);
1545BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1546 struct btrfs_super_block, chunk_root_generation, 64);
1547BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1548 root_level, 8);
1549BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1550 chunk_root, 64);
1551BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1552 chunk_root_level, 8);
1553BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1554 log_root, 64);
1555BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
1556 log_root_transid, 64);
1557BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1558 log_root_level, 8);
1559BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1560 total_bytes, 64);
1561BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1562 bytes_used, 64);
1563BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1564 sectorsize, 32);
1565BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1566 nodesize, 32);
1567BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1568 leafsize, 32);
1569BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1570 stripesize, 32);
1571BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1572 root_dir_objectid, 64);
1573BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1574 num_devices, 64);
1575BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1576 compat_flags, 64);
1577BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1578 compat_flags, 64);
1579BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1580 incompat_flags, 64);
1581BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1582 csum_type, 16);
1583
1584static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1585{
1586 int t = btrfs_super_csum_type(s);
1587 BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
1588 return btrfs_csum_sizes[t];
1589}
1590
1591static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1592{
1593 return offsetof(struct btrfs_leaf, items);
1594}
1595
1596/* struct btrfs_file_extent_item */
1597BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1598
1599static inline unsigned long
1600btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
1601{
1602 unsigned long offset = (unsigned long)e;
1603 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1604 return offset;
1605}
1606
1607static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1608{
1609 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1610}
1611
1612BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1613 disk_bytenr, 64);
1614BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1615 generation, 64);
1616BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1617 disk_num_bytes, 64);
1618BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1619 offset, 64);
1620BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1621 num_bytes, 64);
1622BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1623 ram_bytes, 64);
1624BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1625 compression, 8);
1626BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1627 encryption, 8);
1628BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1629 other_encoding, 16);
1630
1631/* this returns the number of file bytes represented by the inline item.
1632 * If an item is compressed, this is the uncompressed size
1633 */
1634static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1635 struct btrfs_file_extent_item *e)
1636{
1637 return btrfs_file_extent_ram_bytes(eb, e);
1638}
1639
1640/*
1641 * this returns the number of bytes used by the item on disk, minus the
1642 * size of any extent headers. If a file is compressed on disk, this is
1643 * the compressed size
1644 */
1645static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1646 struct btrfs_item *e)
1647{
1648 unsigned long offset;
1649 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1650 return btrfs_item_size(eb, e) - offset;
1651}
1652
1653static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1654{
1655 return sb->s_fs_info;
1656}
1657
1658static inline int btrfs_set_root_name(struct btrfs_root *root,
1659 const char *name, int len)
1660{
1661 /* if we already have a name just free it */
1662 kfree(root->name);
1663
1664 root->name = kmalloc(len+1, GFP_KERNEL);
1665 if (!root->name)
1666 return -ENOMEM;
1667
1668 memcpy(root->name, name, len);
1669 root->name[len] = '\0';
1670
1671 return 0;
1672}
1673
1674static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1675{
1676 if (level == 0)
1677 return root->leafsize;
1678 return root->nodesize;
1679}
1680
1681/* helper function to cast into the data area of the leaf. */
1682#define btrfs_item_ptr(leaf, slot, type) \
1683 ((type *)(btrfs_leaf_data(leaf) + \
1684 btrfs_item_offset_nr(leaf, slot)))
1685
1686#define btrfs_item_ptr_offset(leaf, slot) \
1687 ((unsigned long)(btrfs_leaf_data(leaf) + \
1688 btrfs_item_offset_nr(leaf, slot)))
1689
1690static inline struct dentry *fdentry(struct file *file)
1691{
1692 return file->f_path.dentry;
1693}
1694
1695/* extent-tree.c */
1696int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1697int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1698 struct btrfs_root *root, u64 bytenr,
1699 u64 num_bytes, u32 *refs);
1700int btrfs_update_pinned_extents(struct btrfs_root *root,
1701 u64 bytenr, u64 num, int pin);
1702int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root, struct extent_buffer *leaf);
1704int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 objectid, u64 bytenr);
1706int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root);
1708int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1709struct btrfs_block_group_cache *btrfs_lookup_block_group(
1710 struct btrfs_fs_info *info,
1711 u64 bytenr);
1712u64 btrfs_find_block_group(struct btrfs_root *root,
1713 u64 search_start, u64 search_hint, int owner);
1714struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 u32 blocksize, u64 parent,
1717 u64 root_objectid,
1718 u64 ref_generation,
1719 int level,
1720 u64 hint,
1721 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes,
1728 u64 root_objectid, u64 ref_generation,
1729 u64 owner, u64 empty_size, u64 hint_byte,
1730 u64 search_end, struct btrfs_key *ins, u64 data);
1731int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1732 struct btrfs_root *root, u64 parent,
1733 u64 root_objectid, u64 ref_generation,
1734 u64 owner, struct btrfs_key *ins);
1735int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1736 struct btrfs_root *root, u64 parent,
1737 u64 root_objectid, u64 ref_generation,
1738 u64 owner, struct btrfs_key *ins);
1739int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1740 struct btrfs_root *root,
1741 u64 num_bytes, u64 min_alloc_size,
1742 u64 empty_size, u64 hint_byte,
1743 u64 search_end, struct btrfs_key *ins,
1744 u64 data);
1745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1746 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1747 u32 *nr_extents);
1748int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1749 struct extent_buffer *buf, u32 nr_extents);
1750int btrfs_update_ref(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, struct extent_buffer *orig_buf,
1752 struct extent_buffer *buf, int start_slot, int nr);
1753int btrfs_free_extent(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root,
1755 u64 bytenr, u64 num_bytes, u64 parent,
1756 u64 root_objectid, u64 ref_generation,
1757 u64 owner_objectid, int pin);
1758int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1759int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct extent_io_tree *unpin);
1762int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root,
1764 u64 bytenr, u64 num_bytes, u64 parent,
1765 u64 root_objectid, u64 ref_generation,
1766 u64 owner_objectid);
1767int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root, u64 bytenr,
1769 u64 orig_parent, u64 parent,
1770 u64 root_objectid, u64 ref_generation,
1771 u64 owner_objectid);
1772int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root);
1774int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1775int btrfs_free_block_groups(struct btrfs_fs_info *info);
1776int btrfs_read_block_groups(struct btrfs_root *root);
1777int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1778 struct btrfs_root *root, u64 bytes_used,
1779 u64 type, u64 chunk_objectid, u64 chunk_offset,
1780 u64 size);
1781int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1782 struct btrfs_root *root, u64 group_start);
1783int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1784int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1785 struct btrfs_root *root);
1786int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1787int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root,
1789 struct extent_buffer *buf, u64 orig_start);
1790int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1794/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid,
1797 int type);
1798int btrfs_merge_path(struct btrfs_trans_handle *trans,
1799 struct btrfs_root *root,
1800 struct btrfs_key *node_keys,
1801 u64 *nodes, int lowest_level);
1802int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1803 struct btrfs_root *root, struct btrfs_path *path,
1804 struct btrfs_key *new_key);
1805struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1806struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1807int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1808 struct btrfs_key *key, int lowest_level,
1809 int cache_only, u64 min_trans);
1810int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1811 struct btrfs_key *max_key,
1812 struct btrfs_path *path, int cache_only,
1813 u64 min_trans);
1814int btrfs_cow_block(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *buf,
1816 struct extent_buffer *parent, int parent_slot,
1817 struct extent_buffer **cow_ret, u64 prealloc_dest);
1818int btrfs_copy_root(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct extent_buffer *buf,
1821 struct extent_buffer **cow_ret, u64 new_root_objectid);
1822int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root, struct btrfs_path *path, u32 data_size);
1824int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1825 struct btrfs_root *root,
1826 struct btrfs_path *path,
1827 u32 new_size, int from_end);
1828int btrfs_split_item(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 struct btrfs_key *new_key,
1832 unsigned long split_offset);
1833int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1834 *root, struct btrfs_key *key, struct btrfs_path *p, int
1835 ins_len, int cow);
1836int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root, struct extent_buffer *parent,
1838 int start_slot, int cache_only, u64 *last_ret,
1839 struct btrfs_key *progress);
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p);
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, u64 bytenr);
1849static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root,
1851 struct btrfs_path *path)
1852{
1853 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1854}
1855
1856int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1857 *root, struct btrfs_key *key, void *data, u32 data_size);
1858int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1859 struct btrfs_root *root,
1860 struct btrfs_path *path,
1861 struct btrfs_key *cpu_key, u32 *data_size,
1862 int nr);
1863int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1864 struct btrfs_root *root,
1865 struct btrfs_path *path,
1866 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1867
1868static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root,
1870 struct btrfs_path *path,
1871 struct btrfs_key *key,
1872 u32 data_size)
1873{
1874 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1875}
1876
1877int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1878int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1879int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1880int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1881 *root);
1882int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *node,
1885 struct extent_buffer *parent);
1886/* root-item.c */
1887int btrfs_find_root_ref(struct btrfs_root *tree_root,
1888 struct btrfs_path *path,
1889 u64 root_id, u64 ref_id);
1890int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1891 struct btrfs_root *tree_root,
1892 u64 root_id, u8 type, u64 ref_id,
1893 u64 dirid, u64 sequence,
1894 const char *name, int name_len);
1895int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1896 struct btrfs_key *key);
1897int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1898 *root, struct btrfs_key *key, struct btrfs_root_item
1899 *item);
1900int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1901 *root, struct btrfs_key *key, struct btrfs_root_item
1902 *item);
1903int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1904 btrfs_root_item *item, struct btrfs_key *key);
1905int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1906 u64 *found_objectid);
1907int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1908 struct btrfs_root *latest_root);
1909/* dir-item.c */
1910int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
1911 struct btrfs_root *root, const char *name,
1912 int name_len, u64 dir,
1913 struct btrfs_key *location, u8 type, u64 index);
1914struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1915 struct btrfs_root *root,
1916 struct btrfs_path *path, u64 dir,
1917 const char *name, int name_len,
1918 int mod);
1919struct btrfs_dir_item *
1920btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, u64 dir,
1923 u64 objectid, const char *name, int name_len,
1924 int mod);
1925struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1926 struct btrfs_path *path,
1927 const char *name, int name_len);
1928int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1929 struct btrfs_root *root,
1930 struct btrfs_path *path,
1931 struct btrfs_dir_item *di);
1932int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root, const char *name,
1934 u16 name_len, const void *data, u16 data_len,
1935 u64 dir);
1936struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 struct btrfs_path *path, u64 dir,
1939 const char *name, u16 name_len,
1940 int mod);
1941
1942/* orphan.c */
1943int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1944 struct btrfs_root *root, u64 offset);
1945int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root, u64 offset);
1947
1948/* inode-map.c */
1949int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1950 struct btrfs_root *fs_root,
1951 u64 dirid, u64 *objectid);
1952int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1953
1954/* inode-item.c */
1955int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1956 struct btrfs_root *root,
1957 const char *name, int name_len,
1958 u64 inode_objectid, u64 ref_objectid, u64 index);
1959int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1960 struct btrfs_root *root,
1961 const char *name, int name_len,
1962 u64 inode_objectid, u64 ref_objectid, u64 *index);
1963int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1964 struct btrfs_root *root,
1965 struct btrfs_path *path, u64 objectid);
1966int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1967 *root, struct btrfs_path *path,
1968 struct btrfs_key *location, int mod);
1969
1970/* file-item.c */
1971int btrfs_del_csums(struct btrfs_trans_handle *trans,
1972 struct btrfs_root *root, u64 bytenr, u64 len);
1973int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1974 struct bio *bio, u32 *dst);
1975int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *root,
1977 u64 objectid, u64 pos,
1978 u64 disk_offset, u64 disk_num_bytes,
1979 u64 num_bytes, u64 offset, u64 ram_bytes,
1980 u8 compression, u8 encryption, u16 other_encoding);
1981int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1982 struct btrfs_root *root,
1983 struct btrfs_path *path, u64 objectid,
1984 u64 bytenr, int mod);
1985int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 struct btrfs_ordered_sum *sums);
1988int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1989 struct bio *bio, u64 file_start, int contig);
1990int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1991 u64 start, unsigned long len);
1992struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1993 struct btrfs_root *root,
1994 struct btrfs_path *path,
1995 u64 bytenr, int cow);
1996int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, struct btrfs_path *path,
1998 u64 isize);
1999int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
2000 u64 end, struct list_head *list);
2001/* inode.c */
2002
2003/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
2004#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
2005#define ClearPageChecked ClearPageFsMisc
2006#define SetPageChecked SetPageFsMisc
2007#define PageChecked PageFsMisc
2008#endif
2009
2010struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2011int btrfs_set_inode_index(struct inode *dir, u64 *index);
2012int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2013 struct btrfs_root *root,
2014 struct inode *dir, struct inode *inode,
2015 const char *name, int name_len);
2016int btrfs_add_link(struct btrfs_trans_handle *trans,
2017 struct inode *parent_inode, struct inode *inode,
2018 const char *name, int name_len, int add_backref, u64 index);
2019int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2020 struct btrfs_root *root,
2021 struct inode *inode, u64 new_size,
2022 u32 min_type);
2023
2024int btrfs_start_delalloc_inodes(struct btrfs_root *root);
2025int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2026int btrfs_writepages(struct address_space *mapping,
2027 struct writeback_control *wbc);
2028int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2029 struct btrfs_root *new_root, struct dentry *dentry,
2030 u64 new_dirid, u64 alloc_hint);
2031int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032 size_t size, struct bio *bio, unsigned long bio_flags);
2033
2034unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode);
2042void btrfs_put_inode(struct inode *inode);
2043void btrfs_read_locked_inode(struct inode *inode);
2044int btrfs_write_inode(struct inode *inode, int wait);
2045void btrfs_dirty_inode(struct inode *inode);
2046struct inode *btrfs_alloc_inode(struct super_block *sb);
2047void btrfs_destroy_inode(struct inode *inode);
2048int btrfs_init_cachep(void);
2049void btrfs_destroy_cachep(void);
2050long btrfs_ioctl_trans_end(struct file *file);
2051struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2052 struct btrfs_root *root, int wait);
2053struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2054 struct btrfs_root *root);
2055struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2056 struct btrfs_root *root, int *is_new);
2057int btrfs_commit_write(struct file *file, struct page *page,
2058 unsigned from, unsigned to);
2059struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2060 size_t page_offset, u64 start, u64 end,
2061 int create);
2062int btrfs_update_inode(struct btrfs_trans_handle *trans,
2063 struct btrfs_root *root,
2064 struct inode *inode);
2065int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2066int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2067void btrfs_orphan_cleanup(struct btrfs_root *root);
2068int btrfs_cont_expand(struct inode *inode, loff_t size);
2069
2070/* ioctl.c */
2071long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2072
2073/* file.c */
2074int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2075int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2076 int skip_pinned);
2077int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2078extern struct file_operations btrfs_file_operations;
2079int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, struct inode *inode,
2081 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
2082int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root,
2084 struct inode *inode, u64 start, u64 end);
2085int btrfs_release_file(struct inode *inode, struct file *file);
2086
2087/* tree-defrag.c */
2088int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2089 struct btrfs_root *root, int cache_only);
2090
2091/* sysfs.c */
2092int btrfs_init_sysfs(void);
2093void btrfs_exit_sysfs(void);
2094int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2095int btrfs_sysfs_add_root(struct btrfs_root *root);
2096void btrfs_sysfs_del_root(struct btrfs_root *root);
2097void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2098
2099/* xattr.c */
2100ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2101
2102/* super.c */
2103u64 btrfs_parse_size(char *str);
2104int btrfs_parse_options(struct btrfs_root *root, char *options);
2105int btrfs_sync_fs(struct super_block *sb, int wait);
2106
2107/* acl.c */
2108int btrfs_check_acl(struct inode *inode, int mask);
2109int btrfs_init_acl(struct inode *inode, struct inode *dir);
2110int btrfs_acl_chmod(struct inode *inode);
2111
2112/* free-space-cache.c */
2113int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2114 u64 bytenr, u64 size);
2115int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2116 u64 offset, u64 bytes);
2117int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2118 u64 bytenr, u64 size);
2119int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2120 u64 offset, u64 bytes);
2121void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2122 *block_group);
2123struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2124 *block_group, u64 offset,
2125 u64 bytes);
2126void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2127 u64 bytes);
2128u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2129#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item, you have to do that yourself.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87 path = btrfs_alloc_path();
88 if (!path)
89 return -ENOMEM;
90 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
91 BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
92 return -ENOSPC;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98 * FIXME: at some point we should handle xattr's that are larger than
99 * what we can fit in our leaf. We set location to NULL b/c we arent
100 * pointing at anything else, that will change if we store the xattr
101 * data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148 path = btrfs_alloc_path();
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if you plan on deleting the
209 * item (use mod < 0) or changing the options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if you plan on deleting the
252 * item (use mod < 0) or changing the options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'
319 * this walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375 /* MARKER */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385 return 0;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "compat.h"
31#include "crc32c.h"
32#include "ctree.h"
33#include "disk-io.h"
34#include "transaction.h"
35#include "btrfs_inode.h"
36#include "volumes.h"
37#include "print-tree.h"
38#include "async-thread.h"
39#include "locking.h"
40#include "ref-cache.h"
41#include "tree-log.h"
42
43static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work);
45
46/*
47 * end_io_wq structs are used to do processing in task context when an IO is
48 * complete. This is used during reads to verify checksums, and it is used
49 * by writes to insert metadata for new file extents after IO is complete.
50 */
51struct end_io_wq {
52 struct bio *bio;
53 bio_end_io_t *end_io;
54 void *private;
55 struct btrfs_fs_info *info;
56 int error;
57 int metadata;
58 struct list_head list;
59 struct btrfs_work work;
60};
61
62/*
63 * async submit bios are used to offload expensive checksumming
64 * onto the worker threads. They checksum file and metadata bios
65 * just before they are sent down the IO stack.
66 */
67struct async_submit_bio {
68 struct inode *inode;
69 struct bio *bio;
70 struct list_head list;
71 extent_submit_bio_hook_t *submit_bio_start;
72 extent_submit_bio_hook_t *submit_bio_done;
73 int rw;
74 int mirror_num;
75 unsigned long bio_flags;
76 struct btrfs_work work;
77};
78
79/*
80 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device
82 */
83static struct extent_map *btree_get_extent(struct inode *inode,
84 struct page *page, size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_len = (u64)-1;
109 em->block_start = 0;
110 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
111
112 spin_lock(&em_tree->lock);
113 ret = add_extent_mapping(em_tree, em);
114 if (ret == -EEXIST) {
115 u64 failed_start = em->start;
116 u64 failed_len = em->len;
117
118 free_extent_map(em);
119 em = lookup_extent_mapping(em_tree, start, len);
120 if (em) {
121 ret = 0;
122 } else {
123 em = lookup_extent_mapping(em_tree, failed_start,
124 failed_len);
125 ret = -EIO;
126 }
127 } else if (ret) {
128 free_extent_map(em);
129 em = NULL;
130 }
131 spin_unlock(&em_tree->lock);
132
133 if (ret)
134 em = ERR_PTR(ret);
135out:
136 return em;
137}
138
139u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
140{
141 return btrfs_crc32c(seed, data, len);
142}
143
144void btrfs_csum_final(u32 crc, char *result)
145{
146 *(__le32 *)result = ~cpu_to_le32(crc);
147}
148
149/*
150 * compute the csum for a btree block, and either verify it or write it
151 * into the csum field of the block.
152 */
153static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
154 int verify)
155{
156 u16 csum_size =
157 btrfs_super_csum_size(&root->fs_info->super_copy);
158 char *result = NULL;
159 unsigned long len;
160 unsigned long cur_len;
161 unsigned long offset = BTRFS_CSUM_SIZE;
162 char *map_token = NULL;
163 char *kaddr;
164 unsigned long map_start;
165 unsigned long map_len;
166 int err;
167 u32 crc = ~(u32)0;
168 unsigned long inline_result;
169
170 len = buf->len - offset;
171 while (len > 0) {
172 err = map_private_extent_buffer(buf, offset, 32,
173 &map_token, &kaddr,
174 &map_start, &map_len, KM_USER0);
175 if (err)
176 return 1;
177 cur_len = min(len, map_len - (offset - map_start));
178 crc = btrfs_csum_data(root, kaddr + offset - map_start,
179 crc, cur_len);
180 len -= cur_len;
181 offset += cur_len;
182 unmap_extent_buffer(buf, map_token, KM_USER0);
183 }
184 if (csum_size > sizeof(inline_result)) {
185 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
186 if (!result)
187 return 1;
188 } else {
189 result = (char *)&inline_result;
190 }
191
192 btrfs_csum_final(crc, result);
193
194 if (verify) {
195 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
196 u32 val;
197 u32 found = 0;
198 memcpy(&found, result, csum_size);
199
200 read_extent_buffer(buf, &val, 0, csum_size);
201 printk(KERN_INFO "btrfs: %s checksum verify failed "
202 "on %llu wanted %X found %X level %d\n",
203 root->fs_info->sb->s_id,
204 buf->start, val, found, btrfs_header_level(buf));
205 if (result != (char *)&inline_result)
206 kfree(result);
207 return 1;
208 }
209 } else {
210 write_extent_buffer(buf, result, 0, csum_size);
211 }
212 if (result != (char *)&inline_result)
213 kfree(result);
214 return 0;
215}
216
217/*
218 * we can't consider a given block up to date unless the transid of the
219 * block matches the transid in the parent node's pointer. This is how we
220 * detect blocks that either didn't get written at all or got written
221 * in the wrong place.
222 */
223static int verify_parent_transid(struct extent_io_tree *io_tree,
224 struct extent_buffer *eb, u64 parent_transid)
225{
226 int ret;
227
228 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
229 return 0;
230
231 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
232 if (extent_buffer_uptodate(io_tree, eb) &&
233 btrfs_header_generation(eb) == parent_transid) {
234 ret = 0;
235 goto out;
236 }
237 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
238 (unsigned long long)eb->start,
239 (unsigned long long)parent_transid,
240 (unsigned long long)btrfs_header_generation(eb));
241 ret = 1;
242 clear_extent_buffer_uptodate(io_tree, eb);
243out:
244 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
245 GFP_NOFS);
246 return ret;
247}
248
249/*
250 * helper to read a given tree block, doing retries as required when
251 * the checksums don't match and we have alternate mirrors to try.
252 */
253static int btree_read_extent_buffer_pages(struct btrfs_root *root,
254 struct extent_buffer *eb,
255 u64 start, u64 parent_transid)
256{
257 struct extent_io_tree *io_tree;
258 int ret;
259 int num_copies = 0;
260 int mirror_num = 0;
261
262 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
263 while (1) {
264 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
265 btree_get_extent, mirror_num);
266 if (!ret &&
267 !verify_parent_transid(io_tree, eb, parent_transid))
268 return ret;
269
270 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
271 eb->start, eb->len);
272 if (num_copies == 1)
273 return ret;
274
275 mirror_num++;
276 if (mirror_num > num_copies)
277 return ret;
278 }
279 return -EIO;
280}
281
282/*
283 * checksum a dirty tree block before IO. This has extra checks to make sure
284 * we only fill in the checksum field in the first page of a multi-page block
285 */
286
287static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
288{
289 struct extent_io_tree *tree;
290 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
291 u64 found_start;
292 int found_level;
293 unsigned long len;
294 struct extent_buffer *eb;
295 int ret;
296
297 tree = &BTRFS_I(page->mapping->host)->io_tree;
298
299 if (page->private == EXTENT_PAGE_PRIVATE)
300 goto out;
301 if (!page->private)
302 goto out;
303 len = page->private >> 2;
304 WARN_ON(len == 0);
305
306 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
307 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
308 btrfs_header_generation(eb));
309 BUG_ON(ret);
310 found_start = btrfs_header_bytenr(eb);
311 if (found_start != start) {
312 WARN_ON(1);
313 goto err;
314 }
315 if (eb->first_page != page) {
316 WARN_ON(1);
317 goto err;
318 }
319 if (!PageUptodate(page)) {
320 WARN_ON(1);
321 goto err;
322 }
323 found_level = btrfs_header_level(eb);
324
325 csum_tree_block(root, eb, 0);
326err:
327 free_extent_buffer(eb);
328out:
329 return 0;
330}
331
332static int check_tree_block_fsid(struct btrfs_root *root,
333 struct extent_buffer *eb)
334{
335 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
336 u8 fsid[BTRFS_UUID_SIZE];
337 int ret = 1;
338
339 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
340 BTRFS_FSID_SIZE);
341 while (fs_devices) {
342 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
343 ret = 0;
344 break;
345 }
346 fs_devices = fs_devices->seed;
347 }
348 return ret;
349}
350
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state)
353{
354 struct extent_io_tree *tree;
355 u64 found_start;
356 int found_level;
357 unsigned long len;
358 struct extent_buffer *eb;
359 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
360 int ret = 0;
361
362 tree = &BTRFS_I(page->mapping->host)->io_tree;
363 if (page->private == EXTENT_PAGE_PRIVATE)
364 goto out;
365 if (!page->private)
366 goto out;
367
368 len = page->private >> 2;
369 WARN_ON(len == 0);
370
371 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
372
373 found_start = btrfs_header_bytenr(eb);
374 if (found_start != start) {
375 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
376 (unsigned long long)found_start,
377 (unsigned long long)eb->start);
378 ret = -EIO;
379 goto err;
380 }
381 if (eb->first_page != page) {
382 printk(KERN_INFO "btrfs bad first page %lu %lu\n",
383 eb->first_page->index, page->index);
384 WARN_ON(1);
385 ret = -EIO;
386 goto err;
387 }
388 if (check_tree_block_fsid(root, eb)) {
389 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
390 (unsigned long long)eb->start);
391 ret = -EIO;
392 goto err;
393 }
394 found_level = btrfs_header_level(eb);
395
396 ret = csum_tree_block(root, eb, 1);
397 if (ret)
398 ret = -EIO;
399
400 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
401 end = eb->start + end - 1;
402err:
403 free_extent_buffer(eb);
404out:
405 return ret;
406}
407
408static void end_workqueue_bio(struct bio *bio, int err)
409{
410 struct end_io_wq *end_io_wq = bio->bi_private;
411 struct btrfs_fs_info *fs_info;
412
413 fs_info = end_io_wq->info;
414 end_io_wq->error = err;
415 end_io_wq->work.func = end_workqueue_fn;
416 end_io_wq->work.flags = 0;
417
418 if (bio->bi_rw & (1 << BIO_RW)) {
419 if (end_io_wq->metadata)
420 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
421 &end_io_wq->work);
422 else
423 btrfs_queue_worker(&fs_info->endio_write_workers,
424 &end_io_wq->work);
425 } else {
426 if (end_io_wq->metadata)
427 btrfs_queue_worker(&fs_info->endio_meta_workers,
428 &end_io_wq->work);
429 else
430 btrfs_queue_worker(&fs_info->endio_workers,
431 &end_io_wq->work);
432 }
433}
434
435int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
436 int metadata)
437{
438 struct end_io_wq *end_io_wq;
439 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
440 if (!end_io_wq)
441 return -ENOMEM;
442
443 end_io_wq->private = bio->bi_private;
444 end_io_wq->end_io = bio->bi_end_io;
445 end_io_wq->info = info;
446 end_io_wq->error = 0;
447 end_io_wq->bio = bio;
448 end_io_wq->metadata = metadata;
449
450 bio->bi_private = end_io_wq;
451 bio->bi_end_io = end_workqueue_bio;
452 return 0;
453}
454
455unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
456{
457 unsigned long limit = min_t(unsigned long,
458 info->workers.max_workers,
459 info->fs_devices->open_devices);
460 return 256 * limit;
461}
462
463int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
464{
465 return atomic_read(&info->nr_async_bios) >
466 btrfs_async_submit_limit(info);
467}
468
469static void run_one_async_start(struct btrfs_work *work)
470{
471 struct btrfs_fs_info *fs_info;
472 struct async_submit_bio *async;
473
474 async = container_of(work, struct async_submit_bio, work);
475 fs_info = BTRFS_I(async->inode)->root->fs_info;
476 async->submit_bio_start(async->inode, async->rw, async->bio,
477 async->mirror_num, async->bio_flags);
478}
479
480static void run_one_async_done(struct btrfs_work *work)
481{
482 struct btrfs_fs_info *fs_info;
483 struct async_submit_bio *async;
484 int limit;
485
486 async = container_of(work, struct async_submit_bio, work);
487 fs_info = BTRFS_I(async->inode)->root->fs_info;
488
489 limit = btrfs_async_submit_limit(fs_info);
490 limit = limit * 2 / 3;
491
492 atomic_dec(&fs_info->nr_async_submits);
493
494 if (atomic_read(&fs_info->nr_async_submits) < limit &&
495 waitqueue_active(&fs_info->async_submit_wait))
496 wake_up(&fs_info->async_submit_wait);
497
498 async->submit_bio_done(async->inode, async->rw, async->bio,
499 async->mirror_num, async->bio_flags);
500}
501
502static void run_one_async_free(struct btrfs_work *work)
503{
504 struct async_submit_bio *async;
505
506 async = container_of(work, struct async_submit_bio, work);
507 kfree(async);
508}
509
510int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
511 int rw, struct bio *bio, int mirror_num,
512 unsigned long bio_flags,
513 extent_submit_bio_hook_t *submit_bio_start,
514 extent_submit_bio_hook_t *submit_bio_done)
515{
516 struct async_submit_bio *async;
517
518 async = kmalloc(sizeof(*async), GFP_NOFS);
519 if (!async)
520 return -ENOMEM;
521
522 async->inode = inode;
523 async->rw = rw;
524 async->bio = bio;
525 async->mirror_num = mirror_num;
526 async->submit_bio_start = submit_bio_start;
527 async->submit_bio_done = submit_bio_done;
528
529 async->work.func = run_one_async_start;
530 async->work.ordered_func = run_one_async_done;
531 async->work.ordered_free = run_one_async_free;
532
533 async->work.flags = 0;
534 async->bio_flags = bio_flags;
535
536 atomic_inc(&fs_info->nr_async_submits);
537 btrfs_queue_worker(&fs_info->workers, &async->work);
538#if 0
539 int limit = btrfs_async_submit_limit(fs_info);
540 if (atomic_read(&fs_info->nr_async_submits) > limit) {
541 wait_event_timeout(fs_info->async_submit_wait,
542 (atomic_read(&fs_info->nr_async_submits) < limit),
543 HZ/10);
544
545 wait_event_timeout(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_bios) < limit),
547 HZ/10);
548 }
549#endif
550 while (atomic_read(&fs_info->async_submit_draining) &&
551 atomic_read(&fs_info->nr_async_submits)) {
552 wait_event(fs_info->async_submit_wait,
553 (atomic_read(&fs_info->nr_async_submits) == 0));
554 }
555
556 return 0;
557}
558
559static int btree_csum_one_bio(struct bio *bio)
560{
561 struct bio_vec *bvec = bio->bi_io_vec;
562 int bio_index = 0;
563 struct btrfs_root *root;
564
565 WARN_ON(bio->bi_vcnt <= 0);
566 while (bio_index < bio->bi_vcnt) {
567 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
568 csum_dirty_buffer(root, bvec->bv_page);
569 bio_index++;
570 bvec++;
571 }
572 return 0;
573}
574
575static int __btree_submit_bio_start(struct inode *inode, int rw,
576 struct bio *bio, int mirror_num,
577 unsigned long bio_flags)
578{
579 /*
580 * when we're called for a write, we're already in the async
581 * submission context. Just jump into btrfs_map_bio
582 */
583 btree_csum_one_bio(bio);
584 return 0;
585}
586
587static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
588 int mirror_num, unsigned long bio_flags)
589{
590 /*
591 * when we're called for a write, we're already in the async
592 * submission context. Just jump into btrfs_map_bio
593 */
594 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
595}
596
597static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
598 int mirror_num, unsigned long bio_flags)
599{
600 int ret;
601
602 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
603 bio, 1);
604 BUG_ON(ret);
605
606 if (!(rw & (1 << BIO_RW))) {
607 /*
608 * called for a read, do the setup so that checksum validation
609 * can happen in the async kernel threads
610 */
611 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
612 mirror_num, 0);
613 }
614 /*
615 * kthread helpers are used to submit writes so that checksumming
616 * can happen in parallel across all CPUs
617 */
618 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
619 inode, rw, bio, mirror_num, 0,
620 __btree_submit_bio_start,
621 __btree_submit_bio_done);
622}
623
624static int btree_writepage(struct page *page, struct writeback_control *wbc)
625{
626 struct extent_io_tree *tree;
627 tree = &BTRFS_I(page->mapping->host)->io_tree;
628
629 if (current->flags & PF_MEMALLOC) {
630 redirty_page_for_writepage(wbc, page);
631 unlock_page(page);
632 return 0;
633 }
634 return extent_write_full_page(tree, page, btree_get_extent, wbc);
635}
636
637static int btree_writepages(struct address_space *mapping,
638 struct writeback_control *wbc)
639{
640 struct extent_io_tree *tree;
641 tree = &BTRFS_I(mapping->host)->io_tree;
642 if (wbc->sync_mode == WB_SYNC_NONE) {
643 u64 num_dirty;
644 u64 start = 0;
645 unsigned long thresh = 32 * 1024 * 1024;
646
647 if (wbc->for_kupdate)
648 return 0;
649
650 num_dirty = count_range_bits(tree, &start, (u64)-1,
651 thresh, EXTENT_DIRTY);
652 if (num_dirty < thresh)
653 return 0;
654 }
655 return extent_writepages(tree, mapping, btree_get_extent, wbc);
656}
657
658static int btree_readpage(struct file *file, struct page *page)
659{
660 struct extent_io_tree *tree;
661 tree = &BTRFS_I(page->mapping->host)->io_tree;
662 return extent_read_full_page(tree, page, btree_get_extent);
663}
664
665static int btree_releasepage(struct page *page, gfp_t gfp_flags)
666{
667 struct extent_io_tree *tree;
668 struct extent_map_tree *map;
669 int ret;
670
671 if (PageWriteback(page) || PageDirty(page))
672 return 0;
673
674 tree = &BTRFS_I(page->mapping->host)->io_tree;
675 map = &BTRFS_I(page->mapping->host)->extent_tree;
676
677 ret = try_release_extent_state(map, tree, page, gfp_flags);
678 if (!ret)
679 return 0;
680
681 ret = try_release_extent_buffer(tree, page);
682 if (ret == 1) {
683 ClearPagePrivate(page);
684 set_page_private(page, 0);
685 page_cache_release(page);
686 }
687
688 return ret;
689}
690
691static void btree_invalidatepage(struct page *page, unsigned long offset)
692{
693 struct extent_io_tree *tree;
694 tree = &BTRFS_I(page->mapping->host)->io_tree;
695 extent_invalidatepage(tree, page, offset);
696 btree_releasepage(page, GFP_NOFS);
697 if (PagePrivate(page)) {
698 printk(KERN_WARNING "btrfs warning page private not zero "
699 "on page %llu\n", (unsigned long long)page_offset(page));
700 ClearPagePrivate(page);
701 set_page_private(page, 0);
702 page_cache_release(page);
703 }
704}
705
706#if 0
707static int btree_writepage(struct page *page, struct writeback_control *wbc)
708{
709 struct buffer_head *bh;
710 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
711 struct buffer_head *head;
712 if (!page_has_buffers(page)) {
713 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
714 (1 << BH_Dirty)|(1 << BH_Uptodate));
715 }
716 head = page_buffers(page);
717 bh = head;
718 do {
719 if (buffer_dirty(bh))
720 csum_tree_block(root, bh, 0);
721 bh = bh->b_this_page;
722 } while (bh != head);
723 return block_write_full_page(page, btree_get_block, wbc);
724}
725#endif
726
727static struct address_space_operations btree_aops = {
728 .readpage = btree_readpage,
729 .writepage = btree_writepage,
730 .writepages = btree_writepages,
731 .releasepage = btree_releasepage,
732 .invalidatepage = btree_invalidatepage,
733 .sync_page = block_sync_page,
734};
735
736int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
737 u64 parent_transid)
738{
739 struct extent_buffer *buf = NULL;
740 struct inode *btree_inode = root->fs_info->btree_inode;
741 int ret = 0;
742
743 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
744 if (!buf)
745 return 0;
746 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
747 buf, 0, 0, btree_get_extent, 0);
748 free_extent_buffer(buf);
749 return ret;
750}
751
752struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
753 u64 bytenr, u32 blocksize)
754{
755 struct inode *btree_inode = root->fs_info->btree_inode;
756 struct extent_buffer *eb;
757 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
758 bytenr, blocksize, GFP_NOFS);
759 return eb;
760}
761
762struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
763 u64 bytenr, u32 blocksize)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 struct extent_buffer *eb;
767
768 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
769 bytenr, blocksize, NULL, GFP_NOFS);
770 return eb;
771}
772
773
774int btrfs_write_tree_block(struct extent_buffer *buf)
775{
776 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
777 buf->start + buf->len - 1, WB_SYNC_ALL);
778}
779
780int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
781{
782 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
783 buf->start, buf->start + buf->len - 1);
784}
785
786struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
787 u32 blocksize, u64 parent_transid)
788{
789 struct extent_buffer *buf = NULL;
790 struct inode *btree_inode = root->fs_info->btree_inode;
791 struct extent_io_tree *io_tree;
792 int ret;
793
794 io_tree = &BTRFS_I(btree_inode)->io_tree;
795
796 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
797 if (!buf)
798 return NULL;
799
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801
802 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE;
804 else
805 WARN_ON(1);
806 return buf;
807
808}
809
810int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
811 struct extent_buffer *buf)
812{
813 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf));
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf);
819 }
820 return 0;
821}
822
823static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
824 u32 stripesize, struct btrfs_root *root,
825 struct btrfs_fs_info *fs_info,
826 u64 objectid)
827{
828 root->node = NULL;
829 root->commit_root = NULL;
830 root->ref_tree = NULL;
831 root->sectorsize = sectorsize;
832 root->nodesize = nodesize;
833 root->leafsize = leafsize;
834 root->stripesize = stripesize;
835 root->ref_cows = 0;
836 root->track_dirty = 0;
837
838 root->fs_info = fs_info;
839 root->objectid = objectid;
840 root->last_trans = 0;
841 root->highest_inode = 0;
842 root->last_inode_alloc = 0;
843 root->name = NULL;
844 root->in_sysfs = 0;
845
846 INIT_LIST_HEAD(&root->dirty_list);
847 INIT_LIST_HEAD(&root->orphan_list);
848 INIT_LIST_HEAD(&root->dead_list);
849 spin_lock_init(&root->node_lock);
850 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex);
853 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS);
855
856 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
857 root->ref_tree = &root->ref_tree_struct;
858
859 memset(&root->root_key, 0, sizeof(root->root_key));
860 memset(&root->root_item, 0, sizeof(root->root_item));
861 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
862 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
863 root->defrag_trans_start = fs_info->generation;
864 init_completion(&root->kobj_unregister);
865 root->defrag_running = 0;
866 root->defrag_level = 0;
867 root->root_key.objectid = objectid;
868 root->anon_super.s_root = NULL;
869 root->anon_super.s_dev = 0;
870 INIT_LIST_HEAD(&root->anon_super.s_list);
871 INIT_LIST_HEAD(&root->anon_super.s_instances);
872 init_rwsem(&root->anon_super.s_umount);
873
874 return 0;
875}
876
877static int find_and_setup_root(struct btrfs_root *tree_root,
878 struct btrfs_fs_info *fs_info,
879 u64 objectid,
880 struct btrfs_root *root)
881{
882 int ret;
883 u32 blocksize;
884 u64 generation;
885
886 __setup_root(tree_root->nodesize, tree_root->leafsize,
887 tree_root->sectorsize, tree_root->stripesize,
888 root, fs_info, objectid);
889 ret = btrfs_find_last_root(tree_root, objectid,
890 &root->root_item, &root->root_key);
891 BUG_ON(ret);
892
893 generation = btrfs_root_generation(&root->root_item);
894 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
895 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
896 blocksize, generation);
897 BUG_ON(!root->node);
898 return 0;
899}
900
901int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
902 struct btrfs_fs_info *fs_info)
903{
904 struct extent_buffer *eb;
905 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
906 u64 start = 0;
907 u64 end = 0;
908 int ret;
909
910 if (!log_root_tree)
911 return 0;
912
913 while (1) {
914 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
915 0, &start, &end, EXTENT_DIRTY);
916 if (ret)
917 break;
918
919 clear_extent_dirty(&log_root_tree->dirty_log_pages,
920 start, end, GFP_NOFS);
921 }
922 eb = fs_info->log_root_tree->node;
923
924 WARN_ON(btrfs_header_level(eb) != 0);
925 WARN_ON(btrfs_header_nritems(eb) != 0);
926
927 ret = btrfs_free_reserved_extent(fs_info->tree_root,
928 eb->start, eb->len);
929 BUG_ON(ret);
930
931 free_extent_buffer(eb);
932 kfree(fs_info->log_root_tree);
933 fs_info->log_root_tree = NULL;
934 return 0;
935}
936
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info)
939{
940 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root;
942
943 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root)
945 return -ENOMEM;
946
947 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize,
949 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
950
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
954 root->ref_cows = 0;
955
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0);
959
960 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start);
963 btrfs_set_header_generation(root->node, trans->transid);
964 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
965
966 write_extent_buffer(root->node, root->fs_info->fsid,
967 (unsigned long)btrfs_header_fsid(root->node),
968 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root;
972 return 0;
973}
974
975struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
976 struct btrfs_key *location)
977{
978 struct btrfs_root *root;
979 struct btrfs_fs_info *fs_info = tree_root->fs_info;
980 struct btrfs_path *path;
981 struct extent_buffer *l;
982 u64 highest_inode;
983 u64 generation;
984 u32 blocksize;
985 int ret = 0;
986
987 root = kzalloc(sizeof(*root), GFP_NOFS);
988 if (!root)
989 return ERR_PTR(-ENOMEM);
990 if (location->offset == (u64)-1) {
991 ret = find_and_setup_root(tree_root, fs_info,
992 location->objectid, root);
993 if (ret) {
994 kfree(root);
995 return ERR_PTR(ret);
996 }
997 goto insert;
998 }
999
1000 __setup_root(tree_root->nodesize, tree_root->leafsize,
1001 tree_root->sectorsize, tree_root->stripesize,
1002 root, fs_info, location->objectid);
1003
1004 path = btrfs_alloc_path();
1005 BUG_ON(!path);
1006 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1007 if (ret != 0) {
1008 if (ret > 0)
1009 ret = -ENOENT;
1010 goto out;
1011 }
1012 l = path->nodes[0];
1013 read_extent_buffer(l, &root->root_item,
1014 btrfs_item_ptr_offset(l, path->slots[0]),
1015 sizeof(root->root_item));
1016 memcpy(&root->root_key, location, sizeof(*location));
1017 ret = 0;
1018out:
1019 btrfs_release_path(root, path);
1020 btrfs_free_path(path);
1021 if (ret) {
1022 kfree(root);
1023 return ERR_PTR(ret);
1024 }
1025 generation = btrfs_root_generation(&root->root_item);
1026 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1027 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1028 blocksize, generation);
1029 BUG_ON(!root->node);
1030insert:
1031 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1032 root->ref_cows = 1;
1033 ret = btrfs_find_highest_inode(root, &highest_inode);
1034 if (ret == 0) {
1035 root->highest_inode = highest_inode;
1036 root->last_inode_alloc = highest_inode;
1037 }
1038 }
1039 return root;
1040}
1041
1042struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1043 u64 root_objectid)
1044{
1045 struct btrfs_root *root;
1046
1047 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1048 return fs_info->tree_root;
1049 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1050 return fs_info->extent_root;
1051
1052 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1053 (unsigned long)root_objectid);
1054 return root;
1055}
1056
1057struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1058 struct btrfs_key *location)
1059{
1060 struct btrfs_root *root;
1061 int ret;
1062
1063 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1064 return fs_info->tree_root;
1065 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1066 return fs_info->extent_root;
1067 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1068 return fs_info->chunk_root;
1069 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1070 return fs_info->dev_root;
1071 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1072 return fs_info->csum_root;
1073
1074 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1075 (unsigned long)location->objectid);
1076 if (root)
1077 return root;
1078
1079 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1080 if (IS_ERR(root))
1081 return root;
1082
1083 set_anon_super(&root->anon_super, NULL);
1084
1085 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1086 (unsigned long)root->root_key.objectid,
1087 root);
1088 if (ret) {
1089 free_extent_buffer(root->node);
1090 kfree(root);
1091 return ERR_PTR(ret);
1092 }
1093 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1094 ret = btrfs_find_dead_roots(fs_info->tree_root,
1095 root->root_key.objectid, root);
1096 BUG_ON(ret);
1097 btrfs_orphan_cleanup(root);
1098 }
1099 return root;
1100}
1101
1102struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1103 struct btrfs_key *location,
1104 const char *name, int namelen)
1105{
1106 struct btrfs_root *root;
1107 int ret;
1108
1109 root = btrfs_read_fs_root_no_name(fs_info, location);
1110 if (!root)
1111 return NULL;
1112
1113 if (root->in_sysfs)
1114 return root;
1115
1116 ret = btrfs_set_root_name(root, name, namelen);
1117 if (ret) {
1118 free_extent_buffer(root->node);
1119 kfree(root);
1120 return ERR_PTR(ret);
1121 }
1122#if 0
1123 ret = btrfs_sysfs_add_root(root);
1124 if (ret) {
1125 free_extent_buffer(root->node);
1126 kfree(root->name);
1127 kfree(root);
1128 return ERR_PTR(ret);
1129 }
1130#endif
1131 root->in_sysfs = 1;
1132 return root;
1133}
1134
1135static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device;
1141 struct backing_dev_info *bdi;
1142#if 0
1143 if ((bdi_bits & (1 << BDI_write_congested)) &&
1144 btrfs_congested_async(info, 0))
1145 return 1;
1146#endif
1147 list_for_each(cur, &info->fs_devices->devices) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev)
1150 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev);
1152 if (bdi && bdi_congested(bdi, bdi_bits)) {
1153 ret = 1;
1154 break;
1155 }
1156 }
1157 return ret;
1158}
1159
1160/*
1161 * this unplugs every device on the box, and it is only used when page
1162 * is null
1163 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{
1166 struct list_head *cur;
1167 struct btrfs_device *device;
1168 struct btrfs_fs_info *info;
1169
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev)
1174 continue;
1175
1176 bdi = blk_get_backing_dev_info(device->bdev);
1177 if (bdi->unplug_io_fn)
1178 bdi->unplug_io_fn(bdi, page);
1179 }
1180}
1181
1182static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1183{
1184 struct inode *inode;
1185 struct extent_map_tree *em_tree;
1186 struct extent_map *em;
1187 struct address_space *mapping;
1188 u64 offset;
1189
1190 /* the generic O_DIRECT read code does this */
1191 if (1 || !page) {
1192 __unplug_io_fn(bdi, page);
1193 return;
1194 }
1195
1196 /*
1197 * page->mapping may change at any time. Get a consistent copy
1198 * and use that for everything below
1199 */
1200 smp_mb();
1201 mapping = page->mapping;
1202 if (!mapping)
1203 return;
1204
1205 inode = mapping->host;
1206
1207 /*
1208 * don't do the expensive searching for a small number of
1209 * devices
1210 */
1211 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1212 __unplug_io_fn(bdi, page);
1213 return;
1214 }
1215
1216 offset = page_offset(page);
1217
1218 em_tree = &BTRFS_I(inode)->extent_tree;
1219 spin_lock(&em_tree->lock);
1220 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1221 spin_unlock(&em_tree->lock);
1222 if (!em) {
1223 __unplug_io_fn(bdi, page);
1224 return;
1225 }
1226
1227 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1228 free_extent_map(em);
1229 __unplug_io_fn(bdi, page);
1230 return;
1231 }
1232 offset = offset - em->start;
1233 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1234 em->block_start + offset, page);
1235 free_extent_map(em);
1236}
1237
1238static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1239{
1240 bdi_init(bdi);
1241 bdi->ra_pages = default_backing_dev_info.ra_pages;
1242 bdi->state = 0;
1243 bdi->capabilities = default_backing_dev_info.capabilities;
1244 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1245 bdi->unplug_io_data = info;
1246 bdi->congested_fn = btrfs_congested_fn;
1247 bdi->congested_data = info;
1248 return 0;
1249}
1250
1251static int bio_ready_for_csum(struct bio *bio)
1252{
1253 u64 length = 0;
1254 u64 buf_len = 0;
1255 u64 start = 0;
1256 struct page *page;
1257 struct extent_io_tree *io_tree = NULL;
1258 struct btrfs_fs_info *info = NULL;
1259 struct bio_vec *bvec;
1260 int i;
1261 int ret;
1262
1263 bio_for_each_segment(bvec, bio, i) {
1264 page = bvec->bv_page;
1265 if (page->private == EXTENT_PAGE_PRIVATE) {
1266 length += bvec->bv_len;
1267 continue;
1268 }
1269 if (!page->private) {
1270 length += bvec->bv_len;
1271 continue;
1272 }
1273 length = bvec->bv_len;
1274 buf_len = page->private >> 2;
1275 start = page_offset(page) + bvec->bv_offset;
1276 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1277 info = BTRFS_I(page->mapping->host)->root->fs_info;
1278 }
1279 /* are we fully contained in this bio? */
1280 if (buf_len <= length)
1281 return 1;
1282
1283 ret = extent_range_uptodate(io_tree, start + length,
1284 start + buf_len - 1);
1285 if (ret == 1)
1286 return ret;
1287 return ret;
1288}
1289
1290/*
1291 * called by the kthread helper functions to finally call the bio end_io
1292 * functions. This is where read checksum verification actually happens
1293 */
1294static void end_workqueue_fn(struct btrfs_work *work)
1295{
1296 struct bio *bio;
1297 struct end_io_wq *end_io_wq;
1298 struct btrfs_fs_info *fs_info;
1299 int error;
1300
1301 end_io_wq = container_of(work, struct end_io_wq, work);
1302 bio = end_io_wq->bio;
1303 fs_info = end_io_wq->info;
1304
1305 /* metadata bio reads are special because the whole tree block must
1306 * be checksummed at once. This makes sure the entire block is in
1307 * ram and up to date before trying to verify things. For
1308 * blocksize <= pagesize, it is basically a noop
1309 */
1310 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1311 !bio_ready_for_csum(bio)) {
1312 btrfs_queue_worker(&fs_info->endio_meta_workers,
1313 &end_io_wq->work);
1314 return;
1315 }
1316 error = end_io_wq->error;
1317 bio->bi_private = end_io_wq->private;
1318 bio->bi_end_io = end_io_wq->end_io;
1319 kfree(end_io_wq);
1320 bio_endio(bio, error);
1321}
1322
1323static int cleaner_kthread(void *arg)
1324{
1325 struct btrfs_root *root = arg;
1326
1327 do {
1328 smp_mb();
1329 if (root->fs_info->closing)
1330 break;
1331
1332 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1333 mutex_lock(&root->fs_info->cleaner_mutex);
1334 btrfs_clean_old_snapshots(root);
1335 mutex_unlock(&root->fs_info->cleaner_mutex);
1336
1337 if (freezing(current)) {
1338 refrigerator();
1339 } else {
1340 smp_mb();
1341 if (root->fs_info->closing)
1342 break;
1343 set_current_state(TASK_INTERRUPTIBLE);
1344 schedule();
1345 __set_current_state(TASK_RUNNING);
1346 }
1347 } while (!kthread_should_stop());
1348 return 0;
1349}
1350
1351static int transaction_kthread(void *arg)
1352{
1353 struct btrfs_root *root = arg;
1354 struct btrfs_trans_handle *trans;
1355 struct btrfs_transaction *cur;
1356 unsigned long now;
1357 unsigned long delay;
1358 int ret;
1359
1360 do {
1361 smp_mb();
1362 if (root->fs_info->closing)
1363 break;
1364
1365 delay = HZ * 30;
1366 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1367 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1368
1369 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1370 printk(KERN_INFO "btrfs: total reference cache "
1371 "size %llu\n",
1372 root->fs_info->total_ref_cache_size);
1373 }
1374
1375 mutex_lock(&root->fs_info->trans_mutex);
1376 cur = root->fs_info->running_transaction;
1377 if (!cur) {
1378 mutex_unlock(&root->fs_info->trans_mutex);
1379 goto sleep;
1380 }
1381
1382 now = get_seconds();
1383 if (now < cur->start_time || now - cur->start_time < 30) {
1384 mutex_unlock(&root->fs_info->trans_mutex);
1385 delay = HZ * 5;
1386 goto sleep;
1387 }
1388 mutex_unlock(&root->fs_info->trans_mutex);
1389 trans = btrfs_start_transaction(root, 1);
1390 ret = btrfs_commit_transaction(trans, root);
1391sleep:
1392 wake_up_process(root->fs_info->cleaner_kthread);
1393 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1394
1395 if (freezing(current)) {
1396 refrigerator();
1397 } else {
1398 if (root->fs_info->closing)
1399 break;
1400 set_current_state(TASK_INTERRUPTIBLE);
1401 schedule_timeout(delay);
1402 __set_current_state(TASK_RUNNING);
1403 }
1404 } while (!kthread_should_stop());
1405 return 0;
1406}
1407
1408struct btrfs_root *open_ctree(struct super_block *sb,
1409 struct btrfs_fs_devices *fs_devices,
1410 char *options)
1411{
1412 u32 sectorsize;
1413 u32 nodesize;
1414 u32 leafsize;
1415 u32 blocksize;
1416 u32 stripesize;
1417 u64 generation;
1418 u64 features;
1419 struct btrfs_key location;
1420 struct buffer_head *bh;
1421 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1422 GFP_NOFS);
1423 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1424 GFP_NOFS);
1425 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1426 GFP_NOFS);
1427 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1428 GFP_NOFS);
1429 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1430 GFP_NOFS);
1431 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1432 GFP_NOFS);
1433 struct btrfs_root *log_tree_root;
1434
1435 int ret;
1436 int err = -EINVAL;
1437
1438 struct btrfs_super_block *disk_super;
1439
1440 if (!extent_root || !tree_root || !fs_info ||
1441 !chunk_root || !dev_root || !csum_root) {
1442 err = -ENOMEM;
1443 goto fail;
1444 }
1445 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1446 INIT_LIST_HEAD(&fs_info->trans_list);
1447 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock);
1454
1455 init_completion(&fs_info->kobj_unregister);
1456 fs_info->tree_root = tree_root;
1457 fs_info->extent_root = extent_root;
1458 fs_info->csum_root = csum_root;
1459 fs_info->chunk_root = chunk_root;
1460 fs_info->dev_root = dev_root;
1461 fs_info->fs_devices = fs_devices;
1462 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1463 INIT_LIST_HEAD(&fs_info->space_info);
1464 btrfs_mapping_init(&fs_info->mapping_tree);
1465 atomic_set(&fs_info->nr_async_submits, 0);
1466 atomic_set(&fs_info->async_delalloc_pages, 0);
1467 atomic_set(&fs_info->async_submit_draining, 0);
1468 atomic_set(&fs_info->nr_async_bios, 0);
1469 atomic_set(&fs_info->throttles, 0);
1470 atomic_set(&fs_info->throttle_gen, 0);
1471 fs_info->sb = sb;
1472 fs_info->max_extent = (u64)-1;
1473 fs_info->max_inline = 8192 * 1024;
1474 setup_bdi(fs_info, &fs_info->bdi);
1475 fs_info->btree_inode = new_inode(sb);
1476 fs_info->btree_inode->i_ino = 1;
1477 fs_info->btree_inode->i_nlink = 1;
1478
1479 fs_info->thread_pool_size = min_t(unsigned long,
1480 num_online_cpus() + 2, 8);
1481
1482 INIT_LIST_HEAD(&fs_info->ordered_extents);
1483 spin_lock_init(&fs_info->ordered_extent_lock);
1484
1485 sb->s_blocksize = 4096;
1486 sb->s_blocksize_bits = blksize_bits(4096);
1487
1488 /*
1489 * we set the i_size on the btree inode to the max possible int.
1490 * the real end of the address space is determined by all of
1491 * the devices in the system
1492 */
1493 fs_info->btree_inode->i_size = OFFSET_MAX;
1494 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1495 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1496
1497 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1498 fs_info->btree_inode->i_mapping,
1499 GFP_NOFS);
1500 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1501 GFP_NOFS);
1502
1503 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1504
1505 spin_lock_init(&fs_info->block_group_cache_lock);
1506 fs_info->block_group_cache_tree.rb_node = NULL;
1507
1508 extent_io_tree_init(&fs_info->pinned_extents,
1509 fs_info->btree_inode->i_mapping, GFP_NOFS);
1510 extent_io_tree_init(&fs_info->pending_del,
1511 fs_info->btree_inode->i_mapping, GFP_NOFS);
1512 extent_io_tree_init(&fs_info->extent_ins,
1513 fs_info->btree_inode->i_mapping, GFP_NOFS);
1514 fs_info->do_barriers = 1;
1515
1516 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1517 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1518 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1519
1520 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1521 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1522 sizeof(struct btrfs_key));
1523 insert_inode_hash(fs_info->btree_inode);
1524
1525 mutex_init(&fs_info->trans_mutex);
1526 mutex_init(&fs_info->tree_log_mutex);
1527 mutex_init(&fs_info->drop_mutex);
1528 mutex_init(&fs_info->extent_ins_mutex);
1529 mutex_init(&fs_info->pinned_mutex);
1530 mutex_init(&fs_info->chunk_mutex);
1531 mutex_init(&fs_info->transaction_kthread_mutex);
1532 mutex_init(&fs_info->cleaner_mutex);
1533 mutex_init(&fs_info->volume_mutex);
1534 mutex_init(&fs_info->tree_reloc_mutex);
1535 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542
1543 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1545
1546
1547 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1548 if (!bh)
1549 goto fail_iput;
1550
1551 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1552 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1553 sizeof(fs_info->super_for_commit));
1554 brelse(bh);
1555
1556 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1557
1558 disk_super = &fs_info->super_copy;
1559 if (!btrfs_super_root(disk_super))
1560 goto fail_iput;
1561
1562 ret = btrfs_parse_options(tree_root, options);
1563 if (ret) {
1564 err = ret;
1565 goto fail_iput;
1566 }
1567
1568 features = btrfs_super_incompat_flags(disk_super) &
1569 ~BTRFS_FEATURE_INCOMPAT_SUPP;
1570 if (features) {
1571 printk(KERN_ERR "BTRFS: couldn't mount because of "
1572 "unsupported optional features (%Lx).\n",
1573 features);
1574 err = -EINVAL;
1575 goto fail_iput;
1576 }
1577
1578 features = btrfs_super_compat_ro_flags(disk_super) &
1579 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1580 if (!(sb->s_flags & MS_RDONLY) && features) {
1581 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1582 "unsupported option features (%Lx).\n",
1583 features);
1584 err = -EINVAL;
1585 goto fail_iput;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590 * queue work function gets called at interrupt time, and so it
1591 * cannot dynamically grow.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604 * likely that bios will be send down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1619 fs_info->thread_pool_size);
1620 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1621 "endio-meta-write", fs_info->thread_pool_size);
1622 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1623 fs_info->thread_pool_size);
1624
1625 /*
1626 * endios are largely parallel and should have a very
1627 * low idle thresh
1628 */
1629 fs_info->endio_workers.idle_thresh = 4;
1630 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632
1633 btrfs_start_workers(&fs_info->workers, 1);
1634 btrfs_start_workers(&fs_info->submit_workers, 1);
1635 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1636 btrfs_start_workers(&fs_info->fixup_workers, 1);
1637 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1638 btrfs_start_workers(&fs_info->endio_meta_workers,
1639 fs_info->thread_pool_size);
1640 btrfs_start_workers(&fs_info->endio_meta_write_workers,
1641 fs_info->thread_pool_size);
1642 btrfs_start_workers(&fs_info->endio_write_workers,
1643 fs_info->thread_pool_size);
1644
1645 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1646 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1647 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1648
1649 nodesize = btrfs_super_nodesize(disk_super);
1650 leafsize = btrfs_super_leafsize(disk_super);
1651 sectorsize = btrfs_super_sectorsize(disk_super);
1652 stripesize = btrfs_super_stripesize(disk_super);
1653 tree_root->nodesize = nodesize;
1654 tree_root->leafsize = leafsize;
1655 tree_root->sectorsize = sectorsize;
1656 tree_root->stripesize = stripesize;
1657
1658 sb->s_blocksize = sectorsize;
1659 sb->s_blocksize_bits = blksize_bits(sectorsize);
1660
1661 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1662 sizeof(disk_super->magic))) {
1663 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1664 goto fail_sb_buffer;
1665 }
1666
1667 mutex_lock(&fs_info->chunk_mutex);
1668 ret = btrfs_read_sys_array(tree_root);
1669 mutex_unlock(&fs_info->chunk_mutex);
1670 if (ret) {
1671 printk(KERN_WARNING "btrfs: failed to read the system "
1672 "array on %s\n", sb->s_id);
1673 goto fail_sys_array;
1674 }
1675
1676 blocksize = btrfs_level_size(tree_root,
1677 btrfs_super_chunk_root_level(disk_super));
1678 generation = btrfs_super_chunk_root_generation(disk_super);
1679
1680 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1681 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1682
1683 chunk_root->node = read_tree_block(chunk_root,
1684 btrfs_super_chunk_root(disk_super),
1685 blocksize, generation);
1686 BUG_ON(!chunk_root->node);
1687
1688 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1689 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1690 BTRFS_UUID_SIZE);
1691
1692 mutex_lock(&fs_info->chunk_mutex);
1693 ret = btrfs_read_chunk_tree(chunk_root);
1694 mutex_unlock(&fs_info->chunk_mutex);
1695 if (ret) {
1696 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1697 sb->s_id);
1698 goto fail_chunk_root;
1699 }
1700
1701 btrfs_close_extra_devices(fs_devices);
1702
1703 blocksize = btrfs_level_size(tree_root,
1704 btrfs_super_root_level(disk_super));
1705 generation = btrfs_super_generation(disk_super);
1706
1707 tree_root->node = read_tree_block(tree_root,
1708 btrfs_super_root(disk_super),
1709 blocksize, generation);
1710 if (!tree_root->node)
1711 goto fail_chunk_root;
1712
1713
1714 ret = find_and_setup_root(tree_root, fs_info,
1715 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1716 if (ret)
1717 goto fail_tree_root;
1718 extent_root->track_dirty = 1;
1719
1720 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1;
1723
1724 if (ret)
1725 goto fail_extent_root;
1726
1727 ret = find_and_setup_root(tree_root, fs_info,
1728 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1729 if (ret)
1730 goto fail_extent_root;
1731
1732 csum_root->track_dirty = 1;
1733
1734 btrfs_read_block_groups(extent_root);
1735
1736 fs_info->generation = generation;
1737 fs_info->last_trans_committed = generation;
1738 fs_info->data_alloc_profile = (u64)-1;
1739 fs_info->metadata_alloc_profile = (u64)-1;
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread)
1744 goto fail_csum_root;
1745
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root,
1748 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread)
1750 goto fail_cleaner;
1751
1752 if (btrfs_super_log_root(disk_super) != 0) {
1753 u64 bytenr = btrfs_super_log_root(disk_super);
1754
1755 if (fs_devices->rw_devices == 0) {
1756 printk(KERN_WARNING "Btrfs log replay required "
1757 "on RO media\n");
1758 err = -EIO;
1759 goto fail_trans_kthread;
1760 }
1761 blocksize =
1762 btrfs_level_size(tree_root,
1763 btrfs_super_log_root_level(disk_super));
1764
1765 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1766 GFP_NOFS);
1767
1768 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1769 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1770
1771 log_tree_root->node = read_tree_block(tree_root, bytenr,
1772 blocksize,
1773 generation + 1);
1774 ret = btrfs_recover_log_trees(log_tree_root);
1775 BUG_ON(ret);
1776
1777 if (sb->s_flags & MS_RDONLY) {
1778 ret = btrfs_commit_super(tree_root);
1779 BUG_ON(ret);
1780 }
1781 }
1782
1783 if (!(sb->s_flags & MS_RDONLY)) {
1784 ret = btrfs_cleanup_reloc_trees(tree_root);
1785 BUG_ON(ret);
1786 }
1787
1788 location.objectid = BTRFS_FS_TREE_OBJECTID;
1789 location.type = BTRFS_ROOT_ITEM_KEY;
1790 location.offset = (u64)-1;
1791
1792 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1793 if (!fs_info->fs_root)
1794 goto fail_trans_kthread;
1795 return tree_root;
1796
1797fail_trans_kthread:
1798 kthread_stop(fs_info->transaction_kthread);
1799fail_cleaner:
1800 kthread_stop(fs_info->cleaner_kthread);
1801
1802 /*
1803 * make sure we're done with the btree inode before we stop our
1804 * kthreads
1805 */
1806 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1807 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1808
1809fail_csum_root:
1810 free_extent_buffer(csum_root->node);
1811fail_extent_root:
1812 free_extent_buffer(extent_root->node);
1813fail_tree_root:
1814 free_extent_buffer(tree_root->node);
1815fail_chunk_root:
1816 free_extent_buffer(chunk_root->node);
1817fail_sys_array:
1818 free_extent_buffer(dev_root->node);
1819fail_sb_buffer:
1820 btrfs_stop_workers(&fs_info->fixup_workers);
1821 btrfs_stop_workers(&fs_info->delalloc_workers);
1822 btrfs_stop_workers(&fs_info->workers);
1823 btrfs_stop_workers(&fs_info->endio_workers);
1824 btrfs_stop_workers(&fs_info->endio_meta_workers);
1825 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1826 btrfs_stop_workers(&fs_info->endio_write_workers);
1827 btrfs_stop_workers(&fs_info->submit_workers);
1828fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode);
1831fail:
1832 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1834
1835 kfree(extent_root);
1836 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info);
1839 kfree(chunk_root);
1840 kfree(dev_root);
1841 kfree(csum_root);
1842 return ERR_PTR(err);
1843}
1844
1845static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1846{
1847 char b[BDEVNAME_SIZE];
1848
1849 if (uptodate) {
1850 set_buffer_uptodate(bh);
1851 } else {
1852 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1853 printk(KERN_WARNING "lost page write due to "
1854 "I/O error on %s\n",
1855 bdevname(bh->b_bdev, b));
1856 }
1857 /* note, we dont' set_buffer_write_io_error because we have
1858 * our own ways of dealing with the IO errors
1859 */
1860 clear_buffer_uptodate(bh);
1861 }
1862 unlock_buffer(bh);
1863 put_bh(bh);
1864}
1865
1866struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
1867{
1868 struct buffer_head *bh;
1869 struct buffer_head *latest = NULL;
1870 struct btrfs_super_block *super;
1871 int i;
1872 u64 transid = 0;
1873 u64 bytenr;
1874
1875 /* we would like to check all the supers, but that would make
1876 * a btrfs mount succeed after a mkfs from a different FS.
1877 * So, we need to add a special mount option to scan for
1878 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1879 */
1880 for (i = 0; i < 1; i++) {
1881 bytenr = btrfs_sb_offset(i);
1882 if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
1883 break;
1884 bh = __bread(bdev, bytenr / 4096, 4096);
1885 if (!bh)
1886 continue;
1887
1888 super = (struct btrfs_super_block *)bh->b_data;
1889 if (btrfs_super_bytenr(super) != bytenr ||
1890 strncmp((char *)(&super->magic), BTRFS_MAGIC,
1891 sizeof(super->magic))) {
1892 brelse(bh);
1893 continue;
1894 }
1895
1896 if (!latest || btrfs_super_generation(super) > transid) {
1897 brelse(latest);
1898 latest = bh;
1899 transid = btrfs_super_generation(super);
1900 } else {
1901 brelse(bh);
1902 }
1903 }
1904 return latest;
1905}
1906
1907static int write_dev_supers(struct btrfs_device *device,
1908 struct btrfs_super_block *sb,
1909 int do_barriers, int wait, int max_mirrors)
1910{
1911 struct buffer_head *bh;
1912 int i;
1913 int ret;
1914 int errors = 0;
1915 u32 crc;
1916 u64 bytenr;
1917 int last_barrier = 0;
1918
1919 if (max_mirrors == 0)
1920 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
1921
1922 /* make sure only the last submit_bh does a barrier */
1923 if (do_barriers) {
1924 for (i = 0; i < max_mirrors; i++) {
1925 bytenr = btrfs_sb_offset(i);
1926 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1927 device->total_bytes)
1928 break;
1929 last_barrier = i;
1930 }
1931 }
1932
1933 for (i = 0; i < max_mirrors; i++) {
1934 bytenr = btrfs_sb_offset(i);
1935 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1936 break;
1937
1938 if (wait) {
1939 bh = __find_get_block(device->bdev, bytenr / 4096,
1940 BTRFS_SUPER_INFO_SIZE);
1941 BUG_ON(!bh);
1942 brelse(bh);
1943 wait_on_buffer(bh);
1944 if (buffer_uptodate(bh)) {
1945 brelse(bh);
1946 continue;
1947 }
1948 } else {
1949 btrfs_set_super_bytenr(sb, bytenr);
1950
1951 crc = ~(u32)0;
1952 crc = btrfs_csum_data(NULL, (char *)sb +
1953 BTRFS_CSUM_SIZE, crc,
1954 BTRFS_SUPER_INFO_SIZE -
1955 BTRFS_CSUM_SIZE);
1956 btrfs_csum_final(crc, sb->csum);
1957
1958 bh = __getblk(device->bdev, bytenr / 4096,
1959 BTRFS_SUPER_INFO_SIZE);
1960 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1961
1962 set_buffer_uptodate(bh);
1963 get_bh(bh);
1964 lock_buffer(bh);
1965 bh->b_end_io = btrfs_end_buffer_write_sync;
1966 }
1967
1968 if (i == last_barrier && do_barriers && device->barriers) {
1969 ret = submit_bh(WRITE_BARRIER, bh);
1970 if (ret == -EOPNOTSUPP) {
1971 printk("btrfs: disabling barriers on dev %s\n",
1972 device->name);
1973 set_buffer_uptodate(bh);
1974 device->barriers = 0;
1975 get_bh(bh);
1976 lock_buffer(bh);
1977 ret = submit_bh(WRITE, bh);
1978 }
1979 } else {
1980 ret = submit_bh(WRITE, bh);
1981 }
1982
1983 if (!ret && wait) {
1984 wait_on_buffer(bh);
1985 if (!buffer_uptodate(bh))
1986 errors++;
1987 } else if (ret) {
1988 errors++;
1989 }
1990 if (wait)
1991 brelse(bh);
1992 }
1993 return errors < i ? 0 : -1;
1994}
1995
1996int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb;
2002 struct btrfs_dev_item *dev_item;
2003 int ret;
2004 int do_barriers;
2005 int max_errors;
2006 int total_errors = 0;
2007 u64 flags;
2008
2009 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2010 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2011
2012 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) {
2017 total_errors++;
2018 continue;
2019 }
2020 if (!dev->in_fs_metadata || !dev->writeable)
2021 continue;
2022
2023 btrfs_set_stack_device_generation(dev_item, 0);
2024 btrfs_set_stack_device_type(dev_item, dev->type);
2025 btrfs_set_stack_device_id(dev_item, dev->devid);
2026 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2027 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2028 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2029 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2030 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2031 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2032 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2033
2034 flags = btrfs_super_flags(sb);
2035 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2036
2037 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2038 if (ret)
2039 total_errors++;
2040 }
2041 if (total_errors > max_errors) {
2042 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2043 total_errors);
2044 BUG();
2045 }
2046
2047 total_errors = 0;
2048 list_for_each(cur, head) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev)
2051 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable)
2053 continue;
2054
2055 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2056 if (ret)
2057 total_errors++;
2058 }
2059 if (total_errors > max_errors) {
2060 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2061 total_errors);
2062 BUG();
2063 }
2064 return 0;
2065}
2066
2067int write_ctree_super(struct btrfs_trans_handle *trans,
2068 struct btrfs_root *root, int max_mirrors)
2069{
2070 int ret;
2071
2072 ret = write_all_supers(root, max_mirrors);
2073 return ret;
2074}
2075
2076int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2077{
2078 radix_tree_delete(&fs_info->fs_roots_radix,
2079 (unsigned long)root->root_key.objectid);
2080 if (root->anon_super.s_dev) {
2081 down_write(&root->anon_super.s_umount);
2082 kill_anon_super(&root->anon_super);
2083 }
2084 if (root->node)
2085 free_extent_buffer(root->node);
2086 if (root->commit_root)
2087 free_extent_buffer(root->commit_root);
2088 kfree(root->name);
2089 kfree(root);
2090 return 0;
2091}
2092
2093static int del_fs_roots(struct btrfs_fs_info *fs_info)
2094{
2095 int ret;
2096 struct btrfs_root *gang[8];
2097 int i;
2098
2099 while (1) {
2100 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2101 (void **)gang, 0,
2102 ARRAY_SIZE(gang));
2103 if (!ret)
2104 break;
2105 for (i = 0; i < ret; i++)
2106 btrfs_free_fs_root(fs_info, gang[i]);
2107 }
2108 return 0;
2109}
2110
2111int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2112{
2113 u64 root_objectid = 0;
2114 struct btrfs_root *gang[8];
2115 int i;
2116 int ret;
2117
2118 while (1) {
2119 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2120 (void **)gang, root_objectid,
2121 ARRAY_SIZE(gang));
2122 if (!ret)
2123 break;
2124 for (i = 0; i < ret; i++) {
2125 root_objectid = gang[i]->root_key.objectid;
2126 ret = btrfs_find_dead_roots(fs_info->tree_root,
2127 root_objectid, gang[i]);
2128 BUG_ON(ret);
2129 btrfs_orphan_cleanup(gang[i]);
2130 }
2131 root_objectid++;
2132 }
2133 return 0;
2134}
2135
2136int btrfs_commit_super(struct btrfs_root *root)
2137{
2138 struct btrfs_trans_handle *trans;
2139 int ret;
2140
2141 mutex_lock(&root->fs_info->cleaner_mutex);
2142 btrfs_clean_old_snapshots(root);
2143 mutex_unlock(&root->fs_info->cleaner_mutex);
2144 trans = btrfs_start_transaction(root, 1);
2145 ret = btrfs_commit_transaction(trans, root);
2146 BUG_ON(ret);
2147 /* run commit again to drop the original snapshot */
2148 trans = btrfs_start_transaction(root, 1);
2149 btrfs_commit_transaction(trans, root);
2150 ret = btrfs_write_and_wait_transaction(NULL, root);
2151 BUG_ON(ret);
2152
2153 ret = write_ctree_super(NULL, root, 0);
2154 return ret;
2155}
2156
2157int close_ctree(struct btrfs_root *root)
2158{
2159 struct btrfs_fs_info *fs_info = root->fs_info;
2160 int ret;
2161
2162 fs_info->closing = 1;
2163 smp_mb();
2164
2165 kthread_stop(root->fs_info->transaction_kthread);
2166 kthread_stop(root->fs_info->cleaner_kthread);
2167
2168 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2169 ret = btrfs_commit_super(root);
2170 if (ret)
2171 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2172 }
2173
2174 if (fs_info->delalloc_bytes) {
2175 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2176 fs_info->delalloc_bytes);
2177 }
2178 if (fs_info->total_ref_cache_size) {
2179 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2180 (unsigned long long)fs_info->total_ref_cache_size);
2181 }
2182
2183 if (fs_info->extent_root->node)
2184 free_extent_buffer(fs_info->extent_root->node);
2185
2186 if (fs_info->tree_root->node)
2187 free_extent_buffer(fs_info->tree_root->node);
2188
2189 if (root->fs_info->chunk_root->node)
2190 free_extent_buffer(root->fs_info->chunk_root->node);
2191
2192 if (root->fs_info->dev_root->node)
2193 free_extent_buffer(root->fs_info->dev_root->node);
2194
2195 if (root->fs_info->csum_root->node)
2196 free_extent_buffer(root->fs_info->csum_root->node);
2197
2198 btrfs_free_block_groups(root->fs_info);
2199
2200 del_fs_roots(fs_info);
2201
2202 iput(fs_info->btree_inode);
2203
2204 btrfs_stop_workers(&fs_info->fixup_workers);
2205 btrfs_stop_workers(&fs_info->delalloc_workers);
2206 btrfs_stop_workers(&fs_info->workers);
2207 btrfs_stop_workers(&fs_info->endio_workers);
2208 btrfs_stop_workers(&fs_info->endio_meta_workers);
2209 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2210 btrfs_stop_workers(&fs_info->endio_write_workers);
2211 btrfs_stop_workers(&fs_info->submit_workers);
2212
2213#if 0
2214 while (!list_empty(&fs_info->hashers)) {
2215 struct btrfs_hasher *hasher;
2216 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2217 hashers);
2218 list_del(&hasher->hashers);
2219 crypto_free_hash(&fs_info->hash_tfm);
2220 kfree(hasher);
2221 }
2222#endif
2223 btrfs_close_devices(fs_info->fs_devices);
2224 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2225
2226 bdi_destroy(&fs_info->bdi);
2227
2228 kfree(fs_info->extent_root);
2229 kfree(fs_info->tree_root);
2230 kfree(fs_info->chunk_root);
2231 kfree(fs_info->dev_root);
2232 kfree(fs_info->csum_root);
2233 return 0;
2234}
2235
2236int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2237{
2238 int ret;
2239 struct inode *btree_inode = buf->first_page->mapping->host;
2240
2241 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2242 if (!ret)
2243 return ret;
2244
2245 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2246 parent_transid);
2247 return !ret;
2248}
2249
2250int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2251{
2252 struct inode *btree_inode = buf->first_page->mapping->host;
2253 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2254 buf);
2255}
2256
2257void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2258{
2259 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2260 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode;
2262
2263 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n",
2267 (unsigned long long)buf->start,
2268 (unsigned long long)transid,
2269 (unsigned long long)root->fs_info->generation);
2270 WARN_ON(1);
2271 }
2272 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2273}
2274
2275void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2276{
2277 /*
2278 * looks as though older kernels can get into trouble with
2279 * this code, they end up stuck in balance_dirty_pages forever
2280 */
2281 struct extent_io_tree *tree;
2282 u64 num_dirty;
2283 u64 start = 0;
2284 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2288 return;
2289
2290 num_dirty = count_range_bits(tree, &start, (u64)-1,
2291 thresh, EXTENT_DIRTY);
2292 if (num_dirty > thresh) {
2293 balance_dirty_pages_ratelimited_nr(
2294 root->fs_info->btree_inode->i_mapping, 1);
2295 }
2296 return;
2297}
2298
2299int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2300{
2301 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2302 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE;
2306 return ret;
2307}
2308
2309int btree_lock_page_hook(struct page *page)
2310{
2311 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb;
2315 unsigned long len;
2316 u64 bytenr = page_offset(page);
2317
2318 if (page->private == EXTENT_PAGE_PRIVATE)
2319 goto out;
2320
2321 len = page->private >> 2;
2322 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2323 if (!eb)
2324 goto out;
2325
2326 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb);
2332out:
2333 lock_page(page);
2334 return 0;
2335}
2336
2337static struct extent_io_ops btree_extent_io_ops = {
2338 .write_cache_pages_lock_hook = btree_lock_page_hook,
2339 .readpage_end_io_hook = btree_readpage_end_io_hook,
2340 .submit_bio_hook = btree_submit_bio_hook,
2341 /* note we're sharing with inode.c for the merge bio hook */
2342 .merge_bio_hook = btrfs_merge_bio_hook,
2343};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24
25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12
27
28static inline u64 btrfs_sb_offset(int mirror)
29{
30 u64 start = 16 * 1024;
31 if (mirror)
32 return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
33 return BTRFS_SUPER_INFO_OFFSET;
34}
35
36struct btrfs_device;
37struct btrfs_fs_devices;
38
39struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, struct extent_buffer *buf);
47struct btrfs_root *open_ctree(struct super_block *sb,
48 struct btrfs_fs_devices *fs_devices,
49 char *options);
50int close_ctree(struct btrfs_root *root);
51int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root,
78 struct extent_buffer *buf);
79int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
80u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
81void btrfs_csum_final(u32 crc, char *result);
82int btrfs_open_device(struct btrfs_device *dev);
83int btrfs_verify_block_csum(struct btrfs_root *root,
84 struct extent_buffer *buf);
85int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
86 int metadata);
87int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
88 int rw, struct bio *bio, int mirror_num,
89 unsigned long bio_flags,
90 extent_submit_bio_hook_t *submit_bio_start,
91 extent_submit_bio_hook_t *submit_bio_done);
92
93int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
94unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
95int btrfs_write_tree_block(struct extent_buffer *buf);
96int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
97int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info);
101int btree_lock_page_hook(struct page *page);
102#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
11 parent_objectid) / 4)
12#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
13 parent_root_objectid) / 4)
14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
17 int connectable)
18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len;
22 int type;
23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
26 return 255;
27
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT;
30
31 fid->objectid = BTRFS_I(inode)->location.objectid;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation;
34
35 if (connectable && !S_ISDIR(inode->i_mode)) {
36 struct inode *parent;
37 u64 parent_root_id;
38
39 spin_lock(&dentry->d_lock);
40
41 parent = dentry->d_parent->d_inode;
42 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
43 fid->parent_gen = parent->i_generation;
44 parent_root_id = BTRFS_I(parent)->root->objectid;
45
46 spin_unlock(&dentry->d_lock);
47
48 if (parent_root_id != fid->root_objectid) {
49 fid->parent_root_objectid = parent_root_id;
50 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
51 type = FILEID_BTRFS_WITH_PARENT_ROOT;
52 } else {
53 len = BTRFS_FID_SIZE_CONNECTABLE;
54 type = FILEID_BTRFS_WITH_PARENT;
55 }
56 }
57
58 *max_len = len;
59 return type;
60}
61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation)
64{
65 struct btrfs_root *root;
66 struct inode *inode;
67 struct btrfs_key key;
68
69 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1;
72
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
74 if (IS_ERR(root))
75 return ERR_CAST(root);
76
77 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0;
80
81 inode = btrfs_iget(sb, &key, root, NULL);
82 if (IS_ERR(inode))
83 return (void *)inode;
84
85 if (generation != inode->i_generation) {
86 iput(inode);
87 return ERR_PTR(-ESTALE);
88 }
89
90 return d_obtain_alias(inode);
91}
92
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
94 int fh_len, int fh_type)
95{
96 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
97 u64 objectid, root_objectid;
98 u32 generation;
99
100 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
101 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
102 return NULL;
103 root_objectid = fid->root_objectid;
104 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
106 return NULL;
107 root_objectid = fid->parent_root_objectid;
108 } else
109 return NULL;
110
111 objectid = fid->parent_objectid;
112 generation = fid->parent_gen;
113
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
115}
116
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
118 int fh_len, int fh_type)
119{
120 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
121 u64 objectid, root_objectid;
122 u32 generation;
123
124 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
126 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
127 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
128 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
129 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
130 return NULL;
131
132 objectid = fid->objectid;
133 root_objectid = fid->root_objectid;
134 generation = fid->gen;
135
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
137}
138
139static struct dentry *btrfs_get_parent(struct dentry *child)
140{
141 struct inode *dir = child->d_inode;
142 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path;
145 struct extent_buffer *leaf;
146 int slot;
147 u64 objectid;
148 int ret;
149
150 path = btrfs_alloc_path();
151
152 key.objectid = dir->i_ino;
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
154 key.offset = (u64)-1;
155
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) {
158 /* Error */
159 btrfs_free_path(path);
160 return ERR_PTR(ret);
161 }
162 leaf = path->nodes[0];
163 slot = path->slots[0];
164 if (ret) {
165 /* btrfs_search_slot() returns the slot where we'd want to
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
167 The _real_ backref, telling us what the parent inode
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 }
177
178 btrfs_item_key_to_cpu(leaf, &key, slot);
179 btrfs_free_path(path);
180
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
182 return ERR_PTR(-EINVAL);
183
184 objectid = key.offset;
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189
190 /* Build a new key for the inode item */
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0;
194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
196}
197
198const struct export_operations btrfs_export_ops = {
199 .encode_fh = btrfs_encode_fh,
200 .fh_to_dentry = btrfs_fh_to_dentry,
201 .fh_to_parent = btrfs_fh_to_parent,
202 .get_parent = btrfs_get_parent,
203};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..293da650873f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/version.h>
23#include "compat.h"
24#include "hash.h"
25#include "crc32c.h"
26#include "ctree.h"
27#include "disk-io.h"
28#include "print-tree.h"
29#include "transaction.h"
30#include "volumes.h"
31#include "locking.h"
32#include "ref-cache.h"
33#include "compat.h"
34
35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1
37#define PENDING_BACKREF_UPDATE 2
38
39struct pending_extent_op {
40 int type;
41 u64 bytenr;
42 u64 num_bytes;
43 u64 parent;
44 u64 orig_parent;
45 u64 generation;
46 u64 orig_generation;
47 int level;
48 struct list_head list;
49 int del;
50};
51
52static int finish_current_insert(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all);
54static int del_pending_extents(struct btrfs_trans_handle *trans,
55 struct btrfs_root *extent_root, int all);
56static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data);
59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free);
63
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{
66 return (cache->flags & bits) == bits;
67}
68
69/*
70 * this adds the block group to the fs_info rb tree for the block group
71 * cache
72 */
73static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 struct btrfs_block_group_cache *block_group)
75{
76 struct rb_node **p;
77 struct rb_node *parent = NULL;
78 struct btrfs_block_group_cache *cache;
79
80 spin_lock(&info->block_group_cache_lock);
81 p = &info->block_group_cache_tree.rb_node;
82
83 while (*p) {
84 parent = *p;
85 cache = rb_entry(parent, struct btrfs_block_group_cache,
86 cache_node);
87 if (block_group->key.objectid < cache->key.objectid) {
88 p = &(*p)->rb_left;
89 } else if (block_group->key.objectid > cache->key.objectid) {
90 p = &(*p)->rb_right;
91 } else {
92 spin_unlock(&info->block_group_cache_lock);
93 return -EEXIST;
94 }
95 }
96
97 rb_link_node(&block_group->cache_node, parent, p);
98 rb_insert_color(&block_group->cache_node,
99 &info->block_group_cache_tree);
100 spin_unlock(&info->block_group_cache_lock);
101
102 return 0;
103}
104
105/*
106 * This will return the block group at or after bytenr if contains is 0, else
107 * it will return the block group that contains the bytenr
108 */
109static struct btrfs_block_group_cache *
110block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 int contains)
112{
113 struct btrfs_block_group_cache *cache, *ret = NULL;
114 struct rb_node *n;
115 u64 end, start;
116
117 spin_lock(&info->block_group_cache_lock);
118 n = info->block_group_cache_tree.rb_node;
119
120 while (n) {
121 cache = rb_entry(n, struct btrfs_block_group_cache,
122 cache_node);
123 end = cache->key.objectid + cache->key.offset - 1;
124 start = cache->key.objectid;
125
126 if (bytenr < start) {
127 if (!contains && (!ret || start < ret->key.objectid))
128 ret = cache;
129 n = n->rb_left;
130 } else if (bytenr > start) {
131 if (contains && bytenr <= end) {
132 ret = cache;
133 break;
134 }
135 n = n->rb_right;
136 } else {
137 ret = cache;
138 break;
139 }
140 }
141 if (ret)
142 atomic_inc(&ret->count);
143 spin_unlock(&info->block_group_cache_lock);
144
145 return ret;
146}
147
148/*
149 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits.
152 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end)
155{
156 u64 extent_start, extent_end, size;
157 int ret;
158
159 mutex_lock(&info->pinned_mutex);
160 while (start < end) {
161 ret = find_first_extent_bit(&info->pinned_extents, start,
162 &extent_start, &extent_end,
163 EXTENT_DIRTY);
164 if (ret)
165 break;
166
167 if (extent_start == start) {
168 start = extent_end + 1;
169 } else if (extent_start > start && extent_start < end) {
170 size = extent_start - start;
171 ret = btrfs_add_free_space(block_group, start,
172 size);
173 BUG_ON(ret);
174 start = extent_end + 1;
175 } else {
176 break;
177 }
178 }
179
180 if (start < end) {
181 size = end - start;
182 ret = btrfs_add_free_space(block_group, start, size);
183 BUG_ON(ret);
184 }
185 mutex_unlock(&info->pinned_mutex);
186
187 return 0;
188}
189
190static int remove_sb_from_cache(struct btrfs_root *root,
191 struct btrfs_block_group_cache *cache)
192{
193 u64 bytenr;
194 u64 *logical;
195 int stripe_len;
196 int i, nr, ret;
197
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 cache->key.objectid, bytenr, 0,
202 &logical, &nr, &stripe_len);
203 BUG_ON(ret);
204 while (nr--) {
205 btrfs_remove_free_space(cache, logical[nr],
206 stripe_len);
207 }
208 kfree(logical);
209 }
210 return 0;
211}
212
213static int cache_block_group(struct btrfs_root *root,
214 struct btrfs_block_group_cache *block_group)
215{
216 struct btrfs_path *path;
217 int ret = 0;
218 struct btrfs_key key;
219 struct extent_buffer *leaf;
220 int slot;
221 u64 last;
222
223 if (!block_group)
224 return 0;
225
226 root = root->fs_info->extent_root;
227
228 if (block_group->cached)
229 return 0;
230
231 path = btrfs_alloc_path();
232 if (!path)
233 return -ENOMEM;
234
235 path->reada = 2;
236 /*
237 * we get into deadlocks with paths held by callers of this function.
238 * since the alloc_mutex is protecting things right now, just
239 * skip the locking here
240 */
241 path->skip_locking = 1;
242 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 key.objectid = last;
244 key.offset = 0;
245 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 if (ret < 0)
248 goto err;
249
250 while (1) {
251 leaf = path->nodes[0];
252 slot = path->slots[0];
253 if (slot >= btrfs_header_nritems(leaf)) {
254 ret = btrfs_next_leaf(root, path);
255 if (ret < 0)
256 goto err;
257 if (ret == 0)
258 continue;
259 else
260 break;
261 }
262 btrfs_item_key_to_cpu(leaf, &key, slot);
263 if (key.objectid < block_group->key.objectid)
264 goto next;
265
266 if (key.objectid >= block_group->key.objectid +
267 block_group->key.offset)
268 break;
269
270 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 add_new_free_space(block_group, root->fs_info, last,
272 key.objectid);
273
274 last = key.objectid + key.offset;
275 }
276next:
277 path->slots[0]++;
278 }
279
280 add_new_free_space(block_group, root->fs_info, last,
281 block_group->key.objectid +
282 block_group->key.offset);
283
284 remove_sb_from_cache(root, block_group);
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295static struct btrfs_block_group_cache *
296btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
297{
298 struct btrfs_block_group_cache *cache;
299
300 cache = block_group_cache_tree_search(info, bytenr, 0);
301
302 return cache;
303}
304
305/*
306 * return the block group that contains teh given bytenr
307 */
308struct btrfs_block_group_cache *btrfs_lookup_block_group(
309 struct btrfs_fs_info *info,
310 u64 bytenr)
311{
312 struct btrfs_block_group_cache *cache;
313
314 cache = block_group_cache_tree_search(info, bytenr, 1);
315
316 return cache;
317}
318
319static inline void put_block_group(struct btrfs_block_group_cache *cache)
320{
321 if (atomic_dec_and_test(&cache->count))
322 kfree(cache);
323}
324
325static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags)
327{
328 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found;
331 list_for_each(cur, head) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags)
334 return found;
335 }
336 return NULL;
337}
338
339static u64 div_factor(u64 num, int factor)
340{
341 if (factor == 10)
342 return num;
343 num *= factor;
344 do_div(num, 10);
345 return num;
346}
347
348u64 btrfs_find_block_group(struct btrfs_root *root,
349 u64 search_start, u64 search_hint, int owner)
350{
351 struct btrfs_block_group_cache *cache;
352 u64 used;
353 u64 last = max(search_hint, search_start);
354 u64 group_start = 0;
355 int full_search = 0;
356 int factor = 9;
357 int wrapped = 0;
358again:
359 while (1) {
360 cache = btrfs_lookup_first_block_group(root->fs_info, last);
361 if (!cache)
362 break;
363
364 spin_lock(&cache->lock);
365 last = cache->key.objectid + cache->key.offset;
366 used = btrfs_block_group_used(&cache->item);
367
368 if ((full_search || !cache->ro) &&
369 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
370 if (used + cache->pinned + cache->reserved <
371 div_factor(cache->key.offset, factor)) {
372 group_start = cache->key.objectid;
373 spin_unlock(&cache->lock);
374 put_block_group(cache);
375 goto found;
376 }
377 }
378 spin_unlock(&cache->lock);
379 put_block_group(cache);
380 cond_resched();
381 }
382 if (!wrapped) {
383 last = search_start;
384 wrapped = 1;
385 goto again;
386 }
387 if (!full_search && factor < 10) {
388 last = search_start;
389 full_search = 1;
390 factor = 10;
391 goto again;
392 }
393found:
394 return group_start;
395}
396
397/* simple helper to search for an existing extent at a given offset */
398int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
399{
400 int ret;
401 struct btrfs_key key;
402 struct btrfs_path *path;
403
404 path = btrfs_alloc_path();
405 BUG_ON(!path);
406 key.objectid = start;
407 key.offset = len;
408 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
409 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
410 0, 0);
411 btrfs_free_path(path);
412 return ret;
413}
414
415/*
416 * Back reference rules. Back refs have three main goals:
417 *
418 * 1) differentiate between all holders of references to an extent so that
419 * when a reference is dropped we can make sure it was a valid reference
420 * before freeing the extent.
421 *
422 * 2) Provide enough information to quickly find the holders of an extent
423 * if we notice a given block is corrupted or bad.
424 *
425 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
426 * maintenance. This is actually the same as #2, but with a slightly
427 * different use case.
428 *
429 * File extents can be referenced by:
430 *
431 * - multiple snapshots, subvolumes, or different generations in one subvol
432 * - different files inside a single subvolume
433 * - different offsets inside a file (bookend extents in file.c)
434 *
435 * The extent ref structure has fields for:
436 *
437 * - Objectid of the subvolume root
438 * - Generation number of the tree holding the reference
439 * - objectid of the file holding the reference
440 * - number of references holding by parent node (alway 1 for tree blocks)
441 *
442 * Btree leaf may hold multiple references to a file extent. In most cases,
443 * these references are from same file and the corresponding offsets inside
444 * the file are close together.
445 *
446 * When a file extent is allocated the fields are filled in:
447 * (root_key.objectid, trans->transid, inode objectid, 1)
448 *
449 * When a leaf is cow'd new references are added for every file extent found
450 * in the leaf. It looks similar to the create case, but trans->transid will
451 * be different when the block is cow'd.
452 *
453 * (root_key.objectid, trans->transid, inode objectid,
454 * number of references in the leaf)
455 *
456 * When a file extent is removed either during snapshot deletion or
457 * file truncation, we find the corresponding back reference and check
458 * the following fields:
459 *
460 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
461 * inode objectid)
462 *
463 * Btree extents can be referenced by:
464 *
465 * - Different subvolumes
466 * - Different generations of the same subvolume
467 *
468 * When a tree block is created, back references are inserted:
469 *
470 * (root->root_key.objectid, trans->transid, level, 1)
471 *
472 * When a tree block is cow'd, new back references are added for all the
473 * blocks it points to. If the tree block isn't in reference counted root,
474 * the old back references are removed. These new back references are of
475 * the form (trans->transid will have increased since creation):
476 *
477 * (root->root_key.objectid, trans->transid, level, 1)
478 *
479 * When a backref is in deleting, the following fields are checked:
480 *
481 * if backref was for a tree root:
482 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
483 * else
484 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
485 *
486 * Back Reference Key composing:
487 *
488 * The key objectid corresponds to the first byte in the extent, the key
489 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
490 * byte of parent extent. If a extent is tree root, the key offset is set
491 * to the key objectid.
492 */
493
494static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root,
496 struct btrfs_path *path,
497 u64 bytenr, u64 parent,
498 u64 ref_root, u64 ref_generation,
499 u64 owner_objectid, int del)
500{
501 struct btrfs_key key;
502 struct btrfs_extent_ref *ref;
503 struct extent_buffer *leaf;
504 u64 ref_objectid;
505 int ret;
506
507 key.objectid = bytenr;
508 key.type = BTRFS_EXTENT_REF_KEY;
509 key.offset = parent;
510
511 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
512 if (ret < 0)
513 goto out;
514 if (ret > 0) {
515 ret = -ENOENT;
516 goto out;
517 }
518
519 leaf = path->nodes[0];
520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
521 ref_objectid = btrfs_ref_objectid(leaf, ref);
522 if (btrfs_ref_root(leaf, ref) != ref_root ||
523 btrfs_ref_generation(leaf, ref) != ref_generation ||
524 (ref_objectid != owner_objectid &&
525 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
526 ret = -EIO;
527 WARN_ON(1);
528 goto out;
529 }
530 ret = 0;
531out:
532 return ret;
533}
534
535/*
536 * updates all the backrefs that are pending on update_list for the
537 * extent_root
538 */
539static noinline int update_backrefs(struct btrfs_trans_handle *trans,
540 struct btrfs_root *extent_root,
541 struct btrfs_path *path,
542 struct list_head *update_list)
543{
544 struct btrfs_key key;
545 struct btrfs_extent_ref *ref;
546 struct btrfs_fs_info *info = extent_root->fs_info;
547 struct pending_extent_op *op;
548 struct extent_buffer *leaf;
549 int ret = 0;
550 struct list_head *cur = update_list->next;
551 u64 ref_objectid;
552 u64 ref_root = extent_root->root_key.objectid;
553
554 op = list_entry(cur, struct pending_extent_op, list);
555
556search:
557 key.objectid = op->bytenr;
558 key.type = BTRFS_EXTENT_REF_KEY;
559 key.offset = op->orig_parent;
560
561 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
562 BUG_ON(ret);
563
564 leaf = path->nodes[0];
565
566loop:
567 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
568
569 ref_objectid = btrfs_ref_objectid(leaf, ref);
570
571 if (btrfs_ref_root(leaf, ref) != ref_root ||
572 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
573 (ref_objectid != op->level &&
574 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
575 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
576 "root %llu, owner %u\n",
577 (unsigned long long)op->bytenr,
578 (unsigned long long)op->orig_parent,
579 (unsigned long long)ref_root, op->level);
580 btrfs_print_leaf(extent_root, leaf);
581 BUG();
582 }
583
584 key.objectid = op->bytenr;
585 key.offset = op->parent;
586 key.type = BTRFS_EXTENT_REF_KEY;
587 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
588 BUG_ON(ret);
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 btrfs_set_ref_generation(leaf, ref, op->generation);
591
592 cur = cur->next;
593
594 list_del_init(&op->list);
595 unlock_extent(&info->extent_ins, op->bytenr,
596 op->bytenr + op->num_bytes - 1, GFP_NOFS);
597 kfree(op);
598
599 if (cur == update_list) {
600 btrfs_mark_buffer_dirty(path->nodes[0]);
601 btrfs_release_path(extent_root, path);
602 goto out;
603 }
604
605 op = list_entry(cur, struct pending_extent_op, list);
606
607 path->slots[0]++;
608 while (path->slots[0] < btrfs_header_nritems(leaf)) {
609 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
610 if (key.objectid == op->bytenr &&
611 key.type == BTRFS_EXTENT_REF_KEY)
612 goto loop;
613 path->slots[0]++;
614 }
615
616 btrfs_mark_buffer_dirty(path->nodes[0]);
617 btrfs_release_path(extent_root, path);
618 goto search;
619
620out:
621 return 0;
622}
623
624static noinline int insert_extents(struct btrfs_trans_handle *trans,
625 struct btrfs_root *extent_root,
626 struct btrfs_path *path,
627 struct list_head *insert_list, int nr)
628{
629 struct btrfs_key *keys;
630 u32 *data_size;
631 struct pending_extent_op *op;
632 struct extent_buffer *leaf;
633 struct list_head *cur = insert_list->next;
634 struct btrfs_fs_info *info = extent_root->fs_info;
635 u64 ref_root = extent_root->root_key.objectid;
636 int i = 0, last = 0, ret;
637 int total = nr * 2;
638
639 if (!nr)
640 return 0;
641
642 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
643 if (!keys)
644 return -ENOMEM;
645
646 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
647 if (!data_size) {
648 kfree(keys);
649 return -ENOMEM;
650 }
651
652 list_for_each_entry(op, insert_list, list) {
653 keys[i].objectid = op->bytenr;
654 keys[i].offset = op->num_bytes;
655 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
656 data_size[i] = sizeof(struct btrfs_extent_item);
657 i++;
658
659 keys[i].objectid = op->bytenr;
660 keys[i].offset = op->parent;
661 keys[i].type = BTRFS_EXTENT_REF_KEY;
662 data_size[i] = sizeof(struct btrfs_extent_ref);
663 i++;
664 }
665
666 op = list_entry(cur, struct pending_extent_op, list);
667 i = 0;
668 while (i < total) {
669 int c;
670 ret = btrfs_insert_some_items(trans, extent_root, path,
671 keys+i, data_size+i, total-i);
672 BUG_ON(ret < 0);
673
674 if (last && ret > 1)
675 BUG();
676
677 leaf = path->nodes[0];
678 for (c = 0; c < ret; c++) {
679 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
680
681 /*
682 * if the first item we inserted was a backref, then
683 * the EXTENT_ITEM will be the odd c's, else it will
684 * be the even c's
685 */
686 if ((ref_first && (c % 2)) ||
687 (!ref_first && !(c % 2))) {
688 struct btrfs_extent_item *itm;
689
690 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
691 struct btrfs_extent_item);
692 btrfs_set_extent_refs(path->nodes[0], itm, 1);
693 op->del++;
694 } else {
695 struct btrfs_extent_ref *ref;
696
697 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
698 struct btrfs_extent_ref);
699 btrfs_set_ref_root(leaf, ref, ref_root);
700 btrfs_set_ref_generation(leaf, ref,
701 op->generation);
702 btrfs_set_ref_objectid(leaf, ref, op->level);
703 btrfs_set_ref_num_refs(leaf, ref, 1);
704 op->del++;
705 }
706
707 /*
708 * using del to see when its ok to free up the
709 * pending_extent_op. In the case where we insert the
710 * last item on the list in order to help do batching
711 * we need to not free the extent op until we actually
712 * insert the extent_item
713 */
714 if (op->del == 2) {
715 unlock_extent(&info->extent_ins, op->bytenr,
716 op->bytenr + op->num_bytes - 1,
717 GFP_NOFS);
718 cur = cur->next;
719 list_del_init(&op->list);
720 kfree(op);
721 if (cur != insert_list)
722 op = list_entry(cur,
723 struct pending_extent_op,
724 list);
725 }
726 }
727 btrfs_mark_buffer_dirty(leaf);
728 btrfs_release_path(extent_root, path);
729
730 /*
731 * Ok backref's and items usually go right next to eachother,
732 * but if we could only insert 1 item that means that we
733 * inserted on the end of a leaf, and we have no idea what may
734 * be on the next leaf so we just play it safe. In order to
735 * try and help this case we insert the last thing on our
736 * insert list so hopefully it will end up being the last
737 * thing on the leaf and everything else will be before it,
738 * which will let us insert a whole bunch of items at the same
739 * time.
740 */
741 if (ret == 1 && !last && (i + ret < total)) {
742 /*
743 * last: where we will pick up the next time around
744 * i: our current key to insert, will be total - 1
745 * cur: the current op we are screwing with
746 * op: duh
747 */
748 last = i + ret;
749 i = total - 1;
750 cur = insert_list->prev;
751 op = list_entry(cur, struct pending_extent_op, list);
752 } else if (last) {
753 /*
754 * ok we successfully inserted the last item on the
755 * list, lets reset everything
756 *
757 * i: our current key to insert, so where we left off
758 * last time
759 * last: done with this
760 * cur: the op we are messing with
761 * op: duh
762 * total: since we inserted the last key, we need to
763 * decrement total so we dont overflow
764 */
765 i = last;
766 last = 0;
767 total--;
768 if (i < total) {
769 cur = insert_list->next;
770 op = list_entry(cur, struct pending_extent_op,
771 list);
772 }
773 } else {
774 i += ret;
775 }
776
777 cond_resched();
778 }
779 ret = 0;
780 kfree(keys);
781 kfree(data_size);
782 return ret;
783}
784
785static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
786 struct btrfs_root *root,
787 struct btrfs_path *path,
788 u64 bytenr, u64 parent,
789 u64 ref_root, u64 ref_generation,
790 u64 owner_objectid)
791{
792 struct btrfs_key key;
793 struct extent_buffer *leaf;
794 struct btrfs_extent_ref *ref;
795 u32 num_refs;
796 int ret;
797
798 key.objectid = bytenr;
799 key.type = BTRFS_EXTENT_REF_KEY;
800 key.offset = parent;
801
802 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
803 if (ret == 0) {
804 leaf = path->nodes[0];
805 ref = btrfs_item_ptr(leaf, path->slots[0],
806 struct btrfs_extent_ref);
807 btrfs_set_ref_root(leaf, ref, ref_root);
808 btrfs_set_ref_generation(leaf, ref, ref_generation);
809 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
810 btrfs_set_ref_num_refs(leaf, ref, 1);
811 } else if (ret == -EEXIST) {
812 u64 existing_owner;
813 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
814 leaf = path->nodes[0];
815 ref = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_ref);
817 if (btrfs_ref_root(leaf, ref) != ref_root ||
818 btrfs_ref_generation(leaf, ref) != ref_generation) {
819 ret = -EIO;
820 WARN_ON(1);
821 goto out;
822 }
823
824 num_refs = btrfs_ref_num_refs(leaf, ref);
825 BUG_ON(num_refs == 0);
826 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
827
828 existing_owner = btrfs_ref_objectid(leaf, ref);
829 if (existing_owner != owner_objectid &&
830 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
831 btrfs_set_ref_objectid(leaf, ref,
832 BTRFS_MULTIPLE_OBJECTIDS);
833 }
834 ret = 0;
835 } else {
836 goto out;
837 }
838 btrfs_mark_buffer_dirty(path->nodes[0]);
839out:
840 btrfs_release_path(root, path);
841 return ret;
842}
843
844static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
845 struct btrfs_root *root,
846 struct btrfs_path *path)
847{
848 struct extent_buffer *leaf;
849 struct btrfs_extent_ref *ref;
850 u32 num_refs;
851 int ret = 0;
852
853 leaf = path->nodes[0];
854 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
855 num_refs = btrfs_ref_num_refs(leaf, ref);
856 BUG_ON(num_refs == 0);
857 num_refs -= 1;
858 if (num_refs == 0) {
859 ret = btrfs_del_item(trans, root, path);
860 } else {
861 btrfs_set_ref_num_refs(leaf, ref, num_refs);
862 btrfs_mark_buffer_dirty(leaf);
863 }
864 btrfs_release_path(root, path);
865 return ret;
866}
867
868#ifdef BIO_RW_DISCARD
869static void btrfs_issue_discard(struct block_device *bdev,
870 u64 start, u64 len)
871{
872 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
873}
874#endif
875
876static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
877 u64 num_bytes)
878{
879#ifdef BIO_RW_DISCARD
880 int ret;
881 u64 map_length = num_bytes;
882 struct btrfs_multi_bio *multi = NULL;
883
884 /* Tell the block device(s) that the sectors can be discarded */
885 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
886 bytenr, &map_length, &multi, 0);
887 if (!ret) {
888 struct btrfs_bio_stripe *stripe = multi->stripes;
889 int i;
890
891 if (map_length > num_bytes)
892 map_length = num_bytes;
893
894 for (i = 0; i < multi->num_stripes; i++, stripe++) {
895 btrfs_issue_discard(stripe->dev->bdev,
896 stripe->physical,
897 map_length);
898 }
899 kfree(multi);
900 }
901
902 return ret;
903#else
904 return 0;
905#endif
906}
907
908static noinline int free_extents(struct btrfs_trans_handle *trans,
909 struct btrfs_root *extent_root,
910 struct list_head *del_list)
911{
912 struct btrfs_fs_info *info = extent_root->fs_info;
913 struct btrfs_path *path;
914 struct btrfs_key key, found_key;
915 struct extent_buffer *leaf;
916 struct list_head *cur;
917 struct pending_extent_op *op;
918 struct btrfs_extent_item *ei;
919 int ret, num_to_del, extent_slot = 0, found_extent = 0;
920 u32 refs;
921 u64 bytes_freed = 0;
922
923 path = btrfs_alloc_path();
924 if (!path)
925 return -ENOMEM;
926 path->reada = 1;
927
928search:
929 /* search for the backref for the current ref we want to delete */
930 cur = del_list->next;
931 op = list_entry(cur, struct pending_extent_op, list);
932 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
933 op->orig_parent,
934 extent_root->root_key.objectid,
935 op->orig_generation, op->level, 1);
936 if (ret) {
937 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
938 "root %llu gen %llu owner %u\n",
939 (unsigned long long)op->bytenr,
940 (unsigned long long)extent_root->root_key.objectid,
941 (unsigned long long)op->orig_generation, op->level);
942 btrfs_print_leaf(extent_root, path->nodes[0]);
943 WARN_ON(1);
944 goto out;
945 }
946
947 extent_slot = path->slots[0];
948 num_to_del = 1;
949 found_extent = 0;
950
951 /*
952 * if we aren't the first item on the leaf we can move back one and see
953 * if our ref is right next to our extent item
954 */
955 if (likely(extent_slot)) {
956 extent_slot--;
957 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
958 extent_slot);
959 if (found_key.objectid == op->bytenr &&
960 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
961 found_key.offset == op->num_bytes) {
962 num_to_del++;
963 found_extent = 1;
964 }
965 }
966
967 /*
968 * if we didn't find the extent we need to delete the backref and then
969 * search for the extent item key so we can update its ref count
970 */
971 if (!found_extent) {
972 key.objectid = op->bytenr;
973 key.type = BTRFS_EXTENT_ITEM_KEY;
974 key.offset = op->num_bytes;
975
976 ret = remove_extent_backref(trans, extent_root, path);
977 BUG_ON(ret);
978 btrfs_release_path(extent_root, path);
979 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
980 BUG_ON(ret);
981 extent_slot = path->slots[0];
982 }
983
984 /* this is where we update the ref count for the extent */
985 leaf = path->nodes[0];
986 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
987 refs = btrfs_extent_refs(leaf, ei);
988 BUG_ON(refs == 0);
989 refs--;
990 btrfs_set_extent_refs(leaf, ei, refs);
991
992 btrfs_mark_buffer_dirty(leaf);
993
994 /*
995 * This extent needs deleting. The reason cur_slot is extent_slot +
996 * num_to_del is because extent_slot points to the slot where the extent
997 * is, and if the backref was not right next to the extent we will be
998 * deleting at least 1 item, and will want to start searching at the
999 * slot directly next to extent_slot. However if we did find the
1000 * backref next to the extent item them we will be deleting at least 2
1001 * items and will want to start searching directly after the ref slot
1002 */
1003 if (!refs) {
1004 struct list_head *pos, *n, *end;
1005 int cur_slot = extent_slot+num_to_del;
1006 u64 super_used;
1007 u64 root_used;
1008
1009 path->slots[0] = extent_slot;
1010 bytes_freed = op->num_bytes;
1011
1012 mutex_lock(&info->pinned_mutex);
1013 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1014 op->num_bytes, op->level >=
1015 BTRFS_FIRST_FREE_OBJECTID);
1016 mutex_unlock(&info->pinned_mutex);
1017 BUG_ON(ret < 0);
1018 op->del = ret;
1019
1020 /*
1021 * we need to see if we can delete multiple things at once, so
1022 * start looping through the list of extents we are wanting to
1023 * delete and see if their extent/backref's are right next to
1024 * eachother and the extents only have 1 ref
1025 */
1026 for (pos = cur->next; pos != del_list; pos = pos->next) {
1027 struct pending_extent_op *tmp;
1028
1029 tmp = list_entry(pos, struct pending_extent_op, list);
1030
1031 /* we only want to delete extent+ref at this stage */
1032 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1033 break;
1034
1035 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1036 if (found_key.objectid != tmp->bytenr ||
1037 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1038 found_key.offset != tmp->num_bytes)
1039 break;
1040
1041 /* check to make sure this extent only has one ref */
1042 ei = btrfs_item_ptr(leaf, cur_slot,
1043 struct btrfs_extent_item);
1044 if (btrfs_extent_refs(leaf, ei) != 1)
1045 break;
1046
1047 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1048 if (found_key.objectid != tmp->bytenr ||
1049 found_key.type != BTRFS_EXTENT_REF_KEY ||
1050 found_key.offset != tmp->orig_parent)
1051 break;
1052
1053 /*
1054 * the ref is right next to the extent, we can set the
1055 * ref count to 0 since we will delete them both now
1056 */
1057 btrfs_set_extent_refs(leaf, ei, 0);
1058
1059 /* pin down the bytes for this extent */
1060 mutex_lock(&info->pinned_mutex);
1061 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1062 tmp->num_bytes, tmp->level >=
1063 BTRFS_FIRST_FREE_OBJECTID);
1064 mutex_unlock(&info->pinned_mutex);
1065 BUG_ON(ret < 0);
1066
1067 /*
1068 * use the del field to tell if we need to go ahead and
1069 * free up the extent when we delete the item or not.
1070 */
1071 tmp->del = ret;
1072 bytes_freed += tmp->num_bytes;
1073
1074 num_to_del += 2;
1075 cur_slot += 2;
1076 }
1077 end = pos;
1078
1079 /* update the free space counters */
1080 spin_lock(&info->delalloc_lock);
1081 super_used = btrfs_super_bytes_used(&info->super_copy);
1082 btrfs_set_super_bytes_used(&info->super_copy,
1083 super_used - bytes_freed);
1084
1085 root_used = btrfs_root_used(&extent_root->root_item);
1086 btrfs_set_root_used(&extent_root->root_item,
1087 root_used - bytes_freed);
1088 spin_unlock(&info->delalloc_lock);
1089
1090 /* delete the items */
1091 ret = btrfs_del_items(trans, extent_root, path,
1092 path->slots[0], num_to_del);
1093 BUG_ON(ret);
1094
1095 /*
1096 * loop through the extents we deleted and do the cleanup work
1097 * on them
1098 */
1099 for (pos = cur, n = pos->next; pos != end;
1100 pos = n, n = pos->next) {
1101 struct pending_extent_op *tmp;
1102 tmp = list_entry(pos, struct pending_extent_op, list);
1103
1104 /*
1105 * remember tmp->del tells us wether or not we pinned
1106 * down the extent
1107 */
1108 ret = update_block_group(trans, extent_root,
1109 tmp->bytenr, tmp->num_bytes, 0,
1110 tmp->del);
1111 BUG_ON(ret);
1112
1113 list_del_init(&tmp->list);
1114 unlock_extent(&info->extent_ins, tmp->bytenr,
1115 tmp->bytenr + tmp->num_bytes - 1,
1116 GFP_NOFS);
1117 kfree(tmp);
1118 }
1119 } else if (refs && found_extent) {
1120 /*
1121 * the ref and extent were right next to eachother, but the
1122 * extent still has a ref, so just free the backref and keep
1123 * going
1124 */
1125 ret = remove_extent_backref(trans, extent_root, path);
1126 BUG_ON(ret);
1127
1128 list_del_init(&op->list);
1129 unlock_extent(&info->extent_ins, op->bytenr,
1130 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1131 kfree(op);
1132 } else {
1133 /*
1134 * the extent has multiple refs and the backref we were looking
1135 * for was not right next to it, so just unlock and go next,
1136 * we're good to go
1137 */
1138 list_del_init(&op->list);
1139 unlock_extent(&info->extent_ins, op->bytenr,
1140 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1141 kfree(op);
1142 }
1143
1144 btrfs_release_path(extent_root, path);
1145 if (!list_empty(del_list))
1146 goto search;
1147
1148out:
1149 btrfs_free_path(path);
1150 return ret;
1151}
1152
1153static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1154 struct btrfs_root *root, u64 bytenr,
1155 u64 orig_parent, u64 parent,
1156 u64 orig_root, u64 ref_root,
1157 u64 orig_generation, u64 ref_generation,
1158 u64 owner_objectid)
1159{
1160 int ret;
1161 struct btrfs_root *extent_root = root->fs_info->extent_root;
1162 struct btrfs_path *path;
1163
1164 if (root == root->fs_info->extent_root) {
1165 struct pending_extent_op *extent_op;
1166 u64 num_bytes;
1167
1168 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1169 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1170 mutex_lock(&root->fs_info->extent_ins_mutex);
1171 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1172 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1173 u64 priv;
1174 ret = get_state_private(&root->fs_info->extent_ins,
1175 bytenr, &priv);
1176 BUG_ON(ret);
1177 extent_op = (struct pending_extent_op *)
1178 (unsigned long)priv;
1179 BUG_ON(extent_op->parent != orig_parent);
1180 BUG_ON(extent_op->generation != orig_generation);
1181
1182 extent_op->parent = parent;
1183 extent_op->generation = ref_generation;
1184 } else {
1185 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1186 BUG_ON(!extent_op);
1187
1188 extent_op->type = PENDING_BACKREF_UPDATE;
1189 extent_op->bytenr = bytenr;
1190 extent_op->num_bytes = num_bytes;
1191 extent_op->parent = parent;
1192 extent_op->orig_parent = orig_parent;
1193 extent_op->generation = ref_generation;
1194 extent_op->orig_generation = orig_generation;
1195 extent_op->level = (int)owner_objectid;
1196 INIT_LIST_HEAD(&extent_op->list);
1197 extent_op->del = 0;
1198
1199 set_extent_bits(&root->fs_info->extent_ins,
1200 bytenr, bytenr + num_bytes - 1,
1201 EXTENT_WRITEBACK, GFP_NOFS);
1202 set_state_private(&root->fs_info->extent_ins,
1203 bytenr, (unsigned long)extent_op);
1204 }
1205 mutex_unlock(&root->fs_info->extent_ins_mutex);
1206 return 0;
1207 }
1208
1209 path = btrfs_alloc_path();
1210 if (!path)
1211 return -ENOMEM;
1212 ret = lookup_extent_backref(trans, extent_root, path,
1213 bytenr, orig_parent, orig_root,
1214 orig_generation, owner_objectid, 1);
1215 if (ret)
1216 goto out;
1217 ret = remove_extent_backref(trans, extent_root, path);
1218 if (ret)
1219 goto out;
1220 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1221 parent, ref_root, ref_generation,
1222 owner_objectid);
1223 BUG_ON(ret);
1224 finish_current_insert(trans, extent_root, 0);
1225 del_pending_extents(trans, extent_root, 0);
1226out:
1227 btrfs_free_path(path);
1228 return ret;
1229}
1230
1231int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1232 struct btrfs_root *root, u64 bytenr,
1233 u64 orig_parent, u64 parent,
1234 u64 ref_root, u64 ref_generation,
1235 u64 owner_objectid)
1236{
1237 int ret;
1238 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1239 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1240 return 0;
1241 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1242 parent, ref_root, ref_root,
1243 ref_generation, ref_generation,
1244 owner_objectid);
1245 return ret;
1246}
1247
1248static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1249 struct btrfs_root *root, u64 bytenr,
1250 u64 orig_parent, u64 parent,
1251 u64 orig_root, u64 ref_root,
1252 u64 orig_generation, u64 ref_generation,
1253 u64 owner_objectid)
1254{
1255 struct btrfs_path *path;
1256 int ret;
1257 struct btrfs_key key;
1258 struct extent_buffer *l;
1259 struct btrfs_extent_item *item;
1260 u32 refs;
1261
1262 path = btrfs_alloc_path();
1263 if (!path)
1264 return -ENOMEM;
1265
1266 path->reada = 1;
1267 key.objectid = bytenr;
1268 key.type = BTRFS_EXTENT_ITEM_KEY;
1269 key.offset = (u64)-1;
1270
1271 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1272 0, 1);
1273 if (ret < 0)
1274 return ret;
1275 BUG_ON(ret == 0 || path->slots[0] == 0);
1276
1277 path->slots[0]--;
1278 l = path->nodes[0];
1279
1280 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1281 if (key.objectid != bytenr) {
1282 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1283 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1284 (unsigned long long)bytenr,
1285 (unsigned long long)key.objectid);
1286 BUG();
1287 }
1288 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1289
1290 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1291 refs = btrfs_extent_refs(l, item);
1292 btrfs_set_extent_refs(l, item, refs + 1);
1293 btrfs_mark_buffer_dirty(path->nodes[0]);
1294
1295 btrfs_release_path(root->fs_info->extent_root, path);
1296
1297 path->reada = 1;
1298 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1299 path, bytenr, parent,
1300 ref_root, ref_generation,
1301 owner_objectid);
1302 BUG_ON(ret);
1303 finish_current_insert(trans, root->fs_info->extent_root, 0);
1304 del_pending_extents(trans, root->fs_info->extent_root, 0);
1305
1306 btrfs_free_path(path);
1307 return 0;
1308}
1309
1310int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1311 struct btrfs_root *root,
1312 u64 bytenr, u64 num_bytes, u64 parent,
1313 u64 ref_root, u64 ref_generation,
1314 u64 owner_objectid)
1315{
1316 int ret;
1317 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1318 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1319 return 0;
1320 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1321 0, ref_root, 0, ref_generation,
1322 owner_objectid);
1323 return ret;
1324}
1325
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1);
1330 del_pending_extents(trans, root->fs_info->extent_root, 1);
1331 return 0;
1332}
1333
1334int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1335 struct btrfs_root *root, u64 bytenr,
1336 u64 num_bytes, u32 *refs)
1337{
1338 struct btrfs_path *path;
1339 int ret;
1340 struct btrfs_key key;
1341 struct extent_buffer *l;
1342 struct btrfs_extent_item *item;
1343
1344 WARN_ON(num_bytes < root->sectorsize);
1345 path = btrfs_alloc_path();
1346 path->reada = 1;
1347 key.objectid = bytenr;
1348 key.offset = num_bytes;
1349 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1350 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1351 0, 0);
1352 if (ret < 0)
1353 goto out;
1354 if (ret != 0) {
1355 btrfs_print_leaf(root, path->nodes[0]);
1356 printk(KERN_INFO "btrfs failed to find block number %llu\n",
1357 (unsigned long long)bytenr);
1358 BUG();
1359 }
1360 l = path->nodes[0];
1361 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1362 *refs = btrfs_extent_refs(l, item);
1363out:
1364 btrfs_free_path(path);
1365 return 0;
1366}
1367
1368int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1369 struct btrfs_root *root, u64 objectid, u64 bytenr)
1370{
1371 struct btrfs_root *extent_root = root->fs_info->extent_root;
1372 struct btrfs_path *path;
1373 struct extent_buffer *leaf;
1374 struct btrfs_extent_ref *ref_item;
1375 struct btrfs_key key;
1376 struct btrfs_key found_key;
1377 u64 ref_root;
1378 u64 last_snapshot;
1379 u32 nritems;
1380 int ret;
1381
1382 key.objectid = bytenr;
1383 key.offset = (u64)-1;
1384 key.type = BTRFS_EXTENT_ITEM_KEY;
1385
1386 path = btrfs_alloc_path();
1387 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1388 if (ret < 0)
1389 goto out;
1390 BUG_ON(ret == 0);
1391
1392 ret = -ENOENT;
1393 if (path->slots[0] == 0)
1394 goto out;
1395
1396 path->slots[0]--;
1397 leaf = path->nodes[0];
1398 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1399
1400 if (found_key.objectid != bytenr ||
1401 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1402 goto out;
1403
1404 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1405 while (1) {
1406 leaf = path->nodes[0];
1407 nritems = btrfs_header_nritems(leaf);
1408 if (path->slots[0] >= nritems) {
1409 ret = btrfs_next_leaf(extent_root, path);
1410 if (ret < 0)
1411 goto out;
1412 if (ret == 0)
1413 continue;
1414 break;
1415 }
1416 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1417 if (found_key.objectid != bytenr)
1418 break;
1419
1420 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1421 path->slots[0]++;
1422 continue;
1423 }
1424
1425 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1426 struct btrfs_extent_ref);
1427 ref_root = btrfs_ref_root(leaf, ref_item);
1428 if ((ref_root != root->root_key.objectid &&
1429 ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1430 objectid != btrfs_ref_objectid(leaf, ref_item)) {
1431 ret = 1;
1432 goto out;
1433 }
1434 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1435 ret = 1;
1436 goto out;
1437 }
1438
1439 path->slots[0]++;
1440 }
1441 ret = 0;
1442out:
1443 btrfs_free_path(path);
1444 return ret;
1445}
1446
1447int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1448 struct extent_buffer *buf, u32 nr_extents)
1449{
1450 struct btrfs_key key;
1451 struct btrfs_file_extent_item *fi;
1452 u64 root_gen;
1453 u32 nritems;
1454 int i;
1455 int level;
1456 int ret = 0;
1457 int shared = 0;
1458
1459 if (!root->ref_cows)
1460 return 0;
1461
1462 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1463 shared = 0;
1464 root_gen = root->root_key.offset;
1465 } else {
1466 shared = 1;
1467 root_gen = trans->transid - 1;
1468 }
1469
1470 level = btrfs_header_level(buf);
1471 nritems = btrfs_header_nritems(buf);
1472
1473 if (level == 0) {
1474 struct btrfs_leaf_ref *ref;
1475 struct btrfs_extent_info *info;
1476
1477 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1478 if (!ref) {
1479 ret = -ENOMEM;
1480 goto out;
1481 }
1482
1483 ref->root_gen = root_gen;
1484 ref->bytenr = buf->start;
1485 ref->owner = btrfs_header_owner(buf);
1486 ref->generation = btrfs_header_generation(buf);
1487 ref->nritems = nr_extents;
1488 info = ref->extents;
1489
1490 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1491 u64 disk_bytenr;
1492 btrfs_item_key_to_cpu(buf, &key, i);
1493 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1494 continue;
1495 fi = btrfs_item_ptr(buf, i,
1496 struct btrfs_file_extent_item);
1497 if (btrfs_file_extent_type(buf, fi) ==
1498 BTRFS_FILE_EXTENT_INLINE)
1499 continue;
1500 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1501 if (disk_bytenr == 0)
1502 continue;
1503
1504 info->bytenr = disk_bytenr;
1505 info->num_bytes =
1506 btrfs_file_extent_disk_num_bytes(buf, fi);
1507 info->objectid = key.objectid;
1508 info->offset = key.offset;
1509 info++;
1510 }
1511
1512 ret = btrfs_add_leaf_ref(root, ref, shared);
1513 if (ret == -EEXIST && shared) {
1514 struct btrfs_leaf_ref *old;
1515 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1516 BUG_ON(!old);
1517 btrfs_remove_leaf_ref(root, old);
1518 btrfs_free_leaf_ref(root, old);
1519 ret = btrfs_add_leaf_ref(root, ref, shared);
1520 }
1521 WARN_ON(ret);
1522 btrfs_free_leaf_ref(root, ref);
1523 }
1524out:
1525 return ret;
1526}
1527
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1530 u32 *nr_extents)
1531{
1532 u64 bytenr;
1533 u64 ref_root;
1534 u64 orig_root;
1535 u64 ref_generation;
1536 u64 orig_generation;
1537 u32 nritems;
1538 u32 nr_file_extents = 0;
1539 struct btrfs_key key;
1540 struct btrfs_file_extent_item *fi;
1541 int i;
1542 int level;
1543 int ret = 0;
1544 int faili = 0;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64);
1547
1548 ref_root = btrfs_header_owner(buf);
1549 ref_generation = btrfs_header_generation(buf);
1550 orig_root = btrfs_header_owner(orig_buf);
1551 orig_generation = btrfs_header_generation(orig_buf);
1552
1553 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf);
1555
1556 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref;
1558 } else {
1559 if (level == 0 &&
1560 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1561 goto out;
1562 if (level != 0 &&
1563 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1564 goto out;
1565 process_func = __btrfs_update_extent_ref;
1566 }
1567
1568 for (i = 0; i < nritems; i++) {
1569 cond_resched();
1570 if (level == 0) {
1571 btrfs_item_key_to_cpu(buf, &key, i);
1572 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1573 continue;
1574 fi = btrfs_item_ptr(buf, i,
1575 struct btrfs_file_extent_item);
1576 if (btrfs_file_extent_type(buf, fi) ==
1577 BTRFS_FILE_EXTENT_INLINE)
1578 continue;
1579 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1580 if (bytenr == 0)
1581 continue;
1582
1583 nr_file_extents++;
1584
1585 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start,
1587 orig_root, ref_root,
1588 orig_generation, ref_generation,
1589 key.objectid);
1590
1591 if (ret) {
1592 faili = i;
1593 WARN_ON(1);
1594 goto fail;
1595 }
1596 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start,
1600 orig_root, ref_root,
1601 orig_generation, ref_generation,
1602 level - 1);
1603 if (ret) {
1604 faili = i;
1605 WARN_ON(1);
1606 goto fail;
1607 }
1608 }
1609 }
1610out:
1611 if (nr_extents) {
1612 if (level == 0)
1613 *nr_extents = nr_file_extents;
1614 else
1615 *nr_extents = nritems;
1616 }
1617 return 0;
1618fail:
1619 WARN_ON(1);
1620 return ret;
1621}
1622
1623int btrfs_update_ref(struct btrfs_trans_handle *trans,
1624 struct btrfs_root *root, struct extent_buffer *orig_buf,
1625 struct extent_buffer *buf, int start_slot, int nr)
1626
1627{
1628 u64 bytenr;
1629 u64 ref_root;
1630 u64 orig_root;
1631 u64 ref_generation;
1632 u64 orig_generation;
1633 struct btrfs_key key;
1634 struct btrfs_file_extent_item *fi;
1635 int i;
1636 int ret;
1637 int slot;
1638 int level;
1639
1640 BUG_ON(start_slot < 0);
1641 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1642
1643 ref_root = btrfs_header_owner(buf);
1644 ref_generation = btrfs_header_generation(buf);
1645 orig_root = btrfs_header_owner(orig_buf);
1646 orig_generation = btrfs_header_generation(orig_buf);
1647 level = btrfs_header_level(buf);
1648
1649 if (!root->ref_cows) {
1650 if (level == 0 &&
1651 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1652 return 0;
1653 if (level != 0 &&
1654 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1655 return 0;
1656 }
1657
1658 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1659 cond_resched();
1660 if (level == 0) {
1661 btrfs_item_key_to_cpu(buf, &key, slot);
1662 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1663 continue;
1664 fi = btrfs_item_ptr(buf, slot,
1665 struct btrfs_file_extent_item);
1666 if (btrfs_file_extent_type(buf, fi) ==
1667 BTRFS_FILE_EXTENT_INLINE)
1668 continue;
1669 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1670 if (bytenr == 0)
1671 continue;
1672 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1673 orig_buf->start, buf->start,
1674 orig_root, ref_root,
1675 orig_generation, ref_generation,
1676 key.objectid);
1677 if (ret)
1678 goto fail;
1679 } else {
1680 bytenr = btrfs_node_blockptr(buf, slot);
1681 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1682 orig_buf->start, buf->start,
1683 orig_root, ref_root,
1684 orig_generation, ref_generation,
1685 level - 1);
1686 if (ret)
1687 goto fail;
1688 }
1689 }
1690 return 0;
1691fail:
1692 WARN_ON(1);
1693 return -1;
1694}
1695
1696static int write_one_cache_group(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_path *path,
1699 struct btrfs_block_group_cache *cache)
1700{
1701 int ret;
1702 int pending_ret;
1703 struct btrfs_root *extent_root = root->fs_info->extent_root;
1704 unsigned long bi;
1705 struct extent_buffer *leaf;
1706
1707 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1708 if (ret < 0)
1709 goto fail;
1710 BUG_ON(ret);
1711
1712 leaf = path->nodes[0];
1713 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1714 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1715 btrfs_mark_buffer_dirty(leaf);
1716 btrfs_release_path(extent_root, path);
1717fail:
1718 finish_current_insert(trans, extent_root, 0);
1719 pending_ret = del_pending_extents(trans, extent_root, 0);
1720 if (ret)
1721 return ret;
1722 if (pending_ret)
1723 return pending_ret;
1724 return 0;
1725
1726}
1727
1728int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1729 struct btrfs_root *root)
1730{
1731 struct btrfs_block_group_cache *cache, *entry;
1732 struct rb_node *n;
1733 int err = 0;
1734 int werr = 0;
1735 struct btrfs_path *path;
1736 u64 last = 0;
1737
1738 path = btrfs_alloc_path();
1739 if (!path)
1740 return -ENOMEM;
1741
1742 while (1) {
1743 cache = NULL;
1744 spin_lock(&root->fs_info->block_group_cache_lock);
1745 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1746 n; n = rb_next(n)) {
1747 entry = rb_entry(n, struct btrfs_block_group_cache,
1748 cache_node);
1749 if (entry->dirty) {
1750 cache = entry;
1751 break;
1752 }
1753 }
1754 spin_unlock(&root->fs_info->block_group_cache_lock);
1755
1756 if (!cache)
1757 break;
1758
1759 cache->dirty = 0;
1760 last += cache->key.offset;
1761
1762 err = write_one_cache_group(trans, root,
1763 path, cache);
1764 /*
1765 * if we fail to write the cache group, we want
1766 * to keep it marked dirty in hopes that a later
1767 * write will work
1768 */
1769 if (err) {
1770 werr = err;
1771 continue;
1772 }
1773 }
1774 btrfs_free_path(path);
1775 return werr;
1776}
1777
1778int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1779{
1780 struct btrfs_block_group_cache *block_group;
1781 int readonly = 0;
1782
1783 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1784 if (!block_group || block_group->ro)
1785 readonly = 1;
1786 if (block_group)
1787 put_block_group(block_group);
1788 return readonly;
1789}
1790
1791static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1792 u64 total_bytes, u64 bytes_used,
1793 struct btrfs_space_info **space_info)
1794{
1795 struct btrfs_space_info *found;
1796
1797 found = __find_space_info(info, flags);
1798 if (found) {
1799 spin_lock(&found->lock);
1800 found->total_bytes += total_bytes;
1801 found->bytes_used += bytes_used;
1802 found->full = 0;
1803 spin_unlock(&found->lock);
1804 *space_info = found;
1805 return 0;
1806 }
1807 found = kzalloc(sizeof(*found), GFP_NOFS);
1808 if (!found)
1809 return -ENOMEM;
1810
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock);
1815 found->flags = flags;
1816 found->total_bytes = total_bytes;
1817 found->bytes_used = bytes_used;
1818 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0;
1821 found->full = 0;
1822 found->force_alloc = 0;
1823 *space_info = found;
1824 return 0;
1825}
1826
1827static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1828{
1829 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1830 BTRFS_BLOCK_GROUP_RAID1 |
1831 BTRFS_BLOCK_GROUP_RAID10 |
1832 BTRFS_BLOCK_GROUP_DUP);
1833 if (extra_flags) {
1834 if (flags & BTRFS_BLOCK_GROUP_DATA)
1835 fs_info->avail_data_alloc_bits |= extra_flags;
1836 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1837 fs_info->avail_metadata_alloc_bits |= extra_flags;
1838 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1839 fs_info->avail_system_alloc_bits |= extra_flags;
1840 }
1841}
1842
1843static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1844{
1845 spin_lock(&cache->space_info->lock);
1846 spin_lock(&cache->lock);
1847 if (!cache->ro) {
1848 cache->space_info->bytes_readonly += cache->key.offset -
1849 btrfs_block_group_used(&cache->item);
1850 cache->ro = 1;
1851 }
1852 spin_unlock(&cache->lock);
1853 spin_unlock(&cache->space_info->lock);
1854}
1855
1856u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1857{
1858 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1859
1860 if (num_devices == 1)
1861 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1862 if (num_devices < 4)
1863 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1864
1865 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1866 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1867 BTRFS_BLOCK_GROUP_RAID10))) {
1868 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1869 }
1870
1871 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1872 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1873 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1874 }
1875
1876 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1877 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1878 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1879 (flags & BTRFS_BLOCK_GROUP_DUP)))
1880 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1881 return flags;
1882}
1883
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force)
1887{
1888 struct btrfs_space_info *space_info;
1889 u64 thresh;
1890 int ret = 0;
1891
1892 mutex_lock(&extent_root->fs_info->chunk_mutex);
1893
1894 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1895
1896 space_info = __find_space_info(extent_root->fs_info, flags);
1897 if (!space_info) {
1898 ret = update_space_info(extent_root->fs_info, flags,
1899 0, 0, &space_info);
1900 BUG_ON(ret);
1901 }
1902 BUG_ON(!space_info);
1903
1904 spin_lock(&space_info->lock);
1905 if (space_info->force_alloc) {
1906 force = 1;
1907 space_info->force_alloc = 0;
1908 }
1909 if (space_info->full) {
1910 spin_unlock(&space_info->lock);
1911 goto out;
1912 }
1913
1914 thresh = space_info->total_bytes - space_info->bytes_readonly;
1915 thresh = div_factor(thresh, 6);
1916 if (!force &&
1917 (space_info->bytes_used + space_info->bytes_pinned +
1918 space_info->bytes_reserved + alloc_bytes) < thresh) {
1919 spin_unlock(&space_info->lock);
1920 goto out;
1921 }
1922 spin_unlock(&space_info->lock);
1923
1924 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1925 if (ret)
1926 space_info->full = 1;
1927out:
1928 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1929 return ret;
1930}
1931
1932static int update_block_group(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root,
1934 u64 bytenr, u64 num_bytes, int alloc,
1935 int mark_free)
1936{
1937 struct btrfs_block_group_cache *cache;
1938 struct btrfs_fs_info *info = root->fs_info;
1939 u64 total = num_bytes;
1940 u64 old_val;
1941 u64 byte_in_group;
1942
1943 while (total) {
1944 cache = btrfs_lookup_block_group(info, bytenr);
1945 if (!cache)
1946 return -1;
1947 byte_in_group = bytenr - cache->key.objectid;
1948 WARN_ON(byte_in_group > cache->key.offset);
1949
1950 spin_lock(&cache->space_info->lock);
1951 spin_lock(&cache->lock);
1952 cache->dirty = 1;
1953 old_val = btrfs_block_group_used(&cache->item);
1954 num_bytes = min(total, cache->key.offset - byte_in_group);
1955 if (alloc) {
1956 old_val += num_bytes;
1957 cache->space_info->bytes_used += num_bytes;
1958 if (cache->ro)
1959 cache->space_info->bytes_readonly -= num_bytes;
1960 btrfs_set_block_group_used(&cache->item, old_val);
1961 spin_unlock(&cache->lock);
1962 spin_unlock(&cache->space_info->lock);
1963 } else {
1964 old_val -= num_bytes;
1965 cache->space_info->bytes_used -= num_bytes;
1966 if (cache->ro)
1967 cache->space_info->bytes_readonly += num_bytes;
1968 btrfs_set_block_group_used(&cache->item, old_val);
1969 spin_unlock(&cache->lock);
1970 spin_unlock(&cache->space_info->lock);
1971 if (mark_free) {
1972 int ret;
1973
1974 ret = btrfs_discard_extent(root, bytenr,
1975 num_bytes);
1976 WARN_ON(ret);
1977
1978 ret = btrfs_add_free_space(cache, bytenr,
1979 num_bytes);
1980 WARN_ON(ret);
1981 }
1982 }
1983 put_block_group(cache);
1984 total -= num_bytes;
1985 bytenr += num_bytes;
1986 }
1987 return 0;
1988}
1989
1990static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991{
1992 struct btrfs_block_group_cache *cache;
1993 u64 bytenr;
1994
1995 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1996 if (!cache)
1997 return 0;
1998
1999 bytenr = cache->key.objectid;
2000 put_block_group(cache);
2001
2002 return bytenr;
2003}
2004
2005int btrfs_update_pinned_extents(struct btrfs_root *root,
2006 u64 bytenr, u64 num, int pin)
2007{
2008 u64 len;
2009 struct btrfs_block_group_cache *cache;
2010 struct btrfs_fs_info *fs_info = root->fs_info;
2011
2012 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2013 if (pin) {
2014 set_extent_dirty(&fs_info->pinned_extents,
2015 bytenr, bytenr + num - 1, GFP_NOFS);
2016 } else {
2017 clear_extent_dirty(&fs_info->pinned_extents,
2018 bytenr, bytenr + num - 1, GFP_NOFS);
2019 }
2020 while (num > 0) {
2021 cache = btrfs_lookup_block_group(fs_info, bytenr);
2022 BUG_ON(!cache);
2023 len = min(num, cache->key.offset -
2024 (bytenr - cache->key.objectid));
2025 if (pin) {
2026 spin_lock(&cache->space_info->lock);
2027 spin_lock(&cache->lock);
2028 cache->pinned += len;
2029 cache->space_info->bytes_pinned += len;
2030 spin_unlock(&cache->lock);
2031 spin_unlock(&cache->space_info->lock);
2032 fs_info->total_pinned += len;
2033 } else {
2034 spin_lock(&cache->space_info->lock);
2035 spin_lock(&cache->lock);
2036 cache->pinned -= len;
2037 cache->space_info->bytes_pinned -= len;
2038 spin_unlock(&cache->lock);
2039 spin_unlock(&cache->space_info->lock);
2040 fs_info->total_pinned -= len;
2041 if (cache->cached)
2042 btrfs_add_free_space(cache, bytenr, len);
2043 }
2044 put_block_group(cache);
2045 bytenr += len;
2046 num -= len;
2047 }
2048 return 0;
2049}
2050
2051static int update_reserved_extents(struct btrfs_root *root,
2052 u64 bytenr, u64 num, int reserve)
2053{
2054 u64 len;
2055 struct btrfs_block_group_cache *cache;
2056 struct btrfs_fs_info *fs_info = root->fs_info;
2057
2058 while (num > 0) {
2059 cache = btrfs_lookup_block_group(fs_info, bytenr);
2060 BUG_ON(!cache);
2061 len = min(num, cache->key.offset -
2062 (bytenr - cache->key.objectid));
2063
2064 spin_lock(&cache->space_info->lock);
2065 spin_lock(&cache->lock);
2066 if (reserve) {
2067 cache->reserved += len;
2068 cache->space_info->bytes_reserved += len;
2069 } else {
2070 cache->reserved -= len;
2071 cache->space_info->bytes_reserved -= len;
2072 }
2073 spin_unlock(&cache->lock);
2074 spin_unlock(&cache->space_info->lock);
2075 put_block_group(cache);
2076 bytenr += len;
2077 num -= len;
2078 }
2079 return 0;
2080}
2081
2082int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2083{
2084 u64 last = 0;
2085 u64 start;
2086 u64 end;
2087 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2088 int ret;
2089
2090 mutex_lock(&root->fs_info->pinned_mutex);
2091 while (1) {
2092 ret = find_first_extent_bit(pinned_extents, last,
2093 &start, &end, EXTENT_DIRTY);
2094 if (ret)
2095 break;
2096 set_extent_dirty(copy, start, end, GFP_NOFS);
2097 last = end + 1;
2098 }
2099 mutex_unlock(&root->fs_info->pinned_mutex);
2100 return 0;
2101}
2102
2103int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root,
2105 struct extent_io_tree *unpin)
2106{
2107 u64 start;
2108 u64 end;
2109 int ret;
2110
2111 mutex_lock(&root->fs_info->pinned_mutex);
2112 while (1) {
2113 ret = find_first_extent_bit(unpin, 0, &start, &end,
2114 EXTENT_DIRTY);
2115 if (ret)
2116 break;
2117
2118 ret = btrfs_discard_extent(root, start, end + 1 - start);
2119
2120 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2121 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2122
2123 if (need_resched()) {
2124 mutex_unlock(&root->fs_info->pinned_mutex);
2125 cond_resched();
2126 mutex_lock(&root->fs_info->pinned_mutex);
2127 }
2128 }
2129 mutex_unlock(&root->fs_info->pinned_mutex);
2130 return ret;
2131}
2132
2133static int finish_current_insert(struct btrfs_trans_handle *trans,
2134 struct btrfs_root *extent_root, int all)
2135{
2136 u64 start;
2137 u64 end;
2138 u64 priv;
2139 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list;
2145 int ret;
2146 int num_inserts = 0, max_inserts;
2147
2148 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list);
2150 INIT_LIST_HEAD(&update_list);
2151
2152 max_inserts = extent_root->leafsize /
2153 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2154 sizeof(struct btrfs_extent_ref) +
2155 sizeof(struct btrfs_extent_item));
2156again:
2157 mutex_lock(&info->extent_ins_mutex);
2158 while (1) {
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK);
2161 if (ret) {
2162 if (skipped && all && !num_inserts) {
2163 skipped = 0;
2164 search = 0;
2165 continue;
2166 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break;
2169 }
2170
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) {
2173 skipped = 1;
2174 search = end + 1;
2175 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex);
2177 cond_resched();
2178 mutex_lock(&info->extent_ins_mutex);
2179 }
2180 continue;
2181 }
2182
2183 ret = get_state_private(&info->extent_ins, start, &priv);
2184 BUG_ON(ret);
2185 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2186
2187 if (extent_op->type == PENDING_EXTENT_INSERT) {
2188 num_inserts++;
2189 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1;
2191 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex);
2193 break;
2194 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2196 list_add_tail(&extent_op->list, &update_list);
2197 search = end + 1;
2198 } else {
2199 BUG();
2200 }
2201 }
2202
2203 /*
2204 * process the update list, clear the writeback bit for it, and if
2205 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it
2207 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1,
2212 EXTENT_WRITEBACK, GFP_NOFS);
2213 if (extent_op->del) {
2214 list_del_init(&extent_op->list);
2215 unlock_extent(&info->extent_ins, extent_op->bytenr,
2216 extent_op->bytenr + extent_op->num_bytes
2217 - 1, GFP_NOFS);
2218 kfree(extent_op);
2219 }
2220 }
2221 mutex_unlock(&info->extent_ins_mutex);
2222
2223 /*
2224 * still have things left on the update list, go ahead an update
2225 * everything
2226 */
2227 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret);
2230 }
2231
2232 /*
2233 * if no inserts need to be done, but we skipped some extents and we
2234 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning
2236 */
2237 if (!num_inserts && all && skipped) {
2238 search = 0;
2239 skipped = 0;
2240 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list);
2242 goto again;
2243 } else if (!num_inserts) {
2244 goto out;
2245 }
2246
2247 /*
2248 * process the insert extents list. Again if we are deleting this
2249 * extent, then just unlock it, pin down the bytes if need be, and be
2250 * done with it. Saves us from having to actually insert the extent
2251 * into the tree and then subsequently come along and delete it
2252 */
2253 mutex_lock(&info->extent_ins_mutex);
2254 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2255 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2256 extent_op->bytenr + extent_op->num_bytes - 1,
2257 EXTENT_WRITEBACK, GFP_NOFS);
2258 if (extent_op->del) {
2259 u64 used;
2260 list_del_init(&extent_op->list);
2261 unlock_extent(&info->extent_ins, extent_op->bytenr,
2262 extent_op->bytenr + extent_op->num_bytes
2263 - 1, GFP_NOFS);
2264
2265 mutex_lock(&extent_root->fs_info->pinned_mutex);
2266 ret = pin_down_bytes(trans, extent_root,
2267 extent_op->bytenr,
2268 extent_op->num_bytes, 0);
2269 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2270
2271 spin_lock(&info->delalloc_lock);
2272 used = btrfs_super_bytes_used(&info->super_copy);
2273 btrfs_set_super_bytes_used(&info->super_copy,
2274 used - extent_op->num_bytes);
2275 used = btrfs_root_used(&extent_root->root_item);
2276 btrfs_set_root_used(&extent_root->root_item,
2277 used - extent_op->num_bytes);
2278 spin_unlock(&info->delalloc_lock);
2279
2280 ret = update_block_group(trans, extent_root,
2281 extent_op->bytenr,
2282 extent_op->num_bytes,
2283 0, ret > 0);
2284 BUG_ON(ret);
2285 kfree(extent_op);
2286 num_inserts--;
2287 }
2288 }
2289 mutex_unlock(&info->extent_ins_mutex);
2290
2291 ret = insert_extents(trans, extent_root, path, &insert_list,
2292 num_inserts);
2293 BUG_ON(ret);
2294
2295 /*
2296 * if we broke out of the loop in order to insert stuff because we hit
2297 * the maximum number of inserts at a time we can handle, then loop
2298 * back and pick up where we left off
2299 */
2300 if (num_inserts == max_inserts) {
2301 INIT_LIST_HEAD(&insert_list);
2302 INIT_LIST_HEAD(&update_list);
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */
2312 if (all && skipped) {
2313 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list);
2315 search = 0;
2316 skipped = 0;
2317 num_inserts = 0;
2318 goto again;
2319 }
2320out:
2321 btrfs_free_path(path);
2322 return 0;
2323}
2324
2325static int pin_down_bytes(struct btrfs_trans_handle *trans,
2326 struct btrfs_root *root,
2327 u64 bytenr, u64 num_bytes, int is_data)
2328{
2329 int err = 0;
2330 struct extent_buffer *buf;
2331
2332 if (is_data)
2333 goto pinit;
2334
2335 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2336 if (!buf)
2337 goto pinit;
2338
2339 /* we can reuse a block if it hasn't been written
2340 * and it is from this transaction. We can't
2341 * reuse anything from the tree log root because
2342 * it has tiny sub-transactions.
2343 */
2344 if (btrfs_buffer_uptodate(buf, 0) &&
2345 btrfs_try_tree_lock(buf)) {
2346 u64 header_owner = btrfs_header_owner(buf);
2347 u64 header_transid = btrfs_header_generation(buf);
2348 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2349 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2350 header_transid == trans->transid &&
2351 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2352 clean_tree_block(NULL, root, buf);
2353 btrfs_tree_unlock(buf);
2354 free_extent_buffer(buf);
2355 return 1;
2356 }
2357 btrfs_tree_unlock(buf);
2358 }
2359 free_extent_buffer(buf);
2360pinit:
2361 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2362
2363 BUG_ON(err < 0);
2364 return 0;
2365}
2366
2367/*
2368 * remove an extent from the root, returns 0 on success
2369 */
2370static int __free_extent(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root,
2372 u64 bytenr, u64 num_bytes, u64 parent,
2373 u64 root_objectid, u64 ref_generation,
2374 u64 owner_objectid, int pin, int mark_free)
2375{
2376 struct btrfs_path *path;
2377 struct btrfs_key key;
2378 struct btrfs_fs_info *info = root->fs_info;
2379 struct btrfs_root *extent_root = info->extent_root;
2380 struct extent_buffer *leaf;
2381 int ret;
2382 int extent_slot = 0;
2383 int found_extent = 0;
2384 int num_to_del = 1;
2385 struct btrfs_extent_item *ei;
2386 u32 refs;
2387
2388 key.objectid = bytenr;
2389 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2390 key.offset = num_bytes;
2391 path = btrfs_alloc_path();
2392 if (!path)
2393 return -ENOMEM;
2394
2395 path->reada = 1;
2396 ret = lookup_extent_backref(trans, extent_root, path,
2397 bytenr, parent, root_objectid,
2398 ref_generation, owner_objectid, 1);
2399 if (ret == 0) {
2400 struct btrfs_key found_key;
2401 extent_slot = path->slots[0];
2402 while (extent_slot > 0) {
2403 extent_slot--;
2404 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2405 extent_slot);
2406 if (found_key.objectid != bytenr)
2407 break;
2408 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2409 found_key.offset == num_bytes) {
2410 found_extent = 1;
2411 break;
2412 }
2413 if (path->slots[0] - extent_slot > 5)
2414 break;
2415 }
2416 if (!found_extent) {
2417 ret = remove_extent_backref(trans, extent_root, path);
2418 BUG_ON(ret);
2419 btrfs_release_path(extent_root, path);
2420 ret = btrfs_search_slot(trans, extent_root,
2421 &key, path, -1, 1);
2422 if (ret) {
2423 printk(KERN_ERR "umm, got %d back from search"
2424 ", was looking for %llu\n", ret,
2425 (unsigned long long)bytenr);
2426 btrfs_print_leaf(extent_root, path->nodes[0]);
2427 }
2428 BUG_ON(ret);
2429 extent_slot = path->slots[0];
2430 }
2431 } else {
2432 btrfs_print_leaf(extent_root, path->nodes[0]);
2433 WARN_ON(1);
2434 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2435 "root %llu gen %llu owner %llu\n",
2436 (unsigned long long)bytenr,
2437 (unsigned long long)root_objectid,
2438 (unsigned long long)ref_generation,
2439 (unsigned long long)owner_objectid);
2440 }
2441
2442 leaf = path->nodes[0];
2443 ei = btrfs_item_ptr(leaf, extent_slot,
2444 struct btrfs_extent_item);
2445 refs = btrfs_extent_refs(leaf, ei);
2446 BUG_ON(refs == 0);
2447 refs -= 1;
2448 btrfs_set_extent_refs(leaf, ei, refs);
2449
2450 btrfs_mark_buffer_dirty(leaf);
2451
2452 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2453 struct btrfs_extent_ref *ref;
2454 ref = btrfs_item_ptr(leaf, path->slots[0],
2455 struct btrfs_extent_ref);
2456 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2457 /* if the back ref and the extent are next to each other
2458 * they get deleted below in one shot
2459 */
2460 path->slots[0] = extent_slot;
2461 num_to_del = 2;
2462 } else if (found_extent) {
2463 /* otherwise delete the extent back ref */
2464 ret = remove_extent_backref(trans, extent_root, path);
2465 BUG_ON(ret);
2466 /* if refs are 0, we need to setup the path for deletion */
2467 if (refs == 0) {
2468 btrfs_release_path(extent_root, path);
2469 ret = btrfs_search_slot(trans, extent_root, &key, path,
2470 -1, 1);
2471 BUG_ON(ret);
2472 }
2473 }
2474
2475 if (refs == 0) {
2476 u64 super_used;
2477 u64 root_used;
2478
2479 if (pin) {
2480 mutex_lock(&root->fs_info->pinned_mutex);
2481 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2482 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2483 mutex_unlock(&root->fs_info->pinned_mutex);
2484 if (ret > 0)
2485 mark_free = 1;
2486 BUG_ON(ret < 0);
2487 }
2488 /* block accounting for super block */
2489 spin_lock(&info->delalloc_lock);
2490 super_used = btrfs_super_bytes_used(&info->super_copy);
2491 btrfs_set_super_bytes_used(&info->super_copy,
2492 super_used - num_bytes);
2493
2494 /* block accounting for root item */
2495 root_used = btrfs_root_used(&root->root_item);
2496 btrfs_set_root_used(&root->root_item,
2497 root_used - num_bytes);
2498 spin_unlock(&info->delalloc_lock);
2499 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2500 num_to_del);
2501 BUG_ON(ret);
2502 btrfs_release_path(extent_root, path);
2503
2504 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2505 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2506 BUG_ON(ret);
2507 }
2508
2509 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2510 mark_free);
2511 BUG_ON(ret);
2512 }
2513 btrfs_free_path(path);
2514 finish_current_insert(trans, extent_root, 0);
2515 return ret;
2516}
2517
2518/*
2519 * find all the blocks marked as pending in the radix tree and remove
2520 * them from the extent map
2521 */
2522static int del_pending_extents(struct btrfs_trans_handle *trans,
2523 struct btrfs_root *extent_root, int all)
2524{
2525 int ret;
2526 int err = 0;
2527 u64 start;
2528 u64 end;
2529 u64 priv;
2530 u64 search = 0;
2531 int nr = 0, skipped = 0;
2532 struct extent_io_tree *pending_del;
2533 struct extent_io_tree *extent_ins;
2534 struct pending_extent_op *extent_op;
2535 struct btrfs_fs_info *info = extent_root->fs_info;
2536 struct list_head delete_list;
2537
2538 INIT_LIST_HEAD(&delete_list);
2539 extent_ins = &extent_root->fs_info->extent_ins;
2540 pending_del = &extent_root->fs_info->pending_del;
2541
2542again:
2543 mutex_lock(&info->extent_ins_mutex);
2544 while (1) {
2545 ret = find_first_extent_bit(pending_del, search, &start, &end,
2546 EXTENT_WRITEBACK);
2547 if (ret) {
2548 if (all && skipped && !nr) {
2549 search = 0;
2550 continue;
2551 }
2552 mutex_unlock(&info->extent_ins_mutex);
2553 break;
2554 }
2555
2556 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2557 if (!ret) {
2558 search = end+1;
2559 skipped = 1;
2560
2561 if (need_resched()) {
2562 mutex_unlock(&info->extent_ins_mutex);
2563 cond_resched();
2564 mutex_lock(&info->extent_ins_mutex);
2565 }
2566
2567 continue;
2568 }
2569 BUG_ON(ret < 0);
2570
2571 ret = get_state_private(pending_del, start, &priv);
2572 BUG_ON(ret);
2573 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2574
2575 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2576 GFP_NOFS);
2577 if (!test_range_bit(extent_ins, start, end,
2578 EXTENT_WRITEBACK, 0)) {
2579 list_add_tail(&extent_op->list, &delete_list);
2580 nr++;
2581 } else {
2582 kfree(extent_op);
2583
2584 ret = get_state_private(&info->extent_ins, start,
2585 &priv);
2586 BUG_ON(ret);
2587 extent_op = (struct pending_extent_op *)
2588 (unsigned long)priv;
2589
2590 clear_extent_bits(&info->extent_ins, start, end,
2591 EXTENT_WRITEBACK, GFP_NOFS);
2592
2593 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2594 list_add_tail(&extent_op->list, &delete_list);
2595 search = end + 1;
2596 nr++;
2597 continue;
2598 }
2599
2600 mutex_lock(&extent_root->fs_info->pinned_mutex);
2601 ret = pin_down_bytes(trans, extent_root, start,
2602 end + 1 - start, 0);
2603 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2604
2605 ret = update_block_group(trans, extent_root, start,
2606 end + 1 - start, 0, ret > 0);
2607
2608 unlock_extent(extent_ins, start, end, GFP_NOFS);
2609 BUG_ON(ret);
2610 kfree(extent_op);
2611 }
2612 if (ret)
2613 err = ret;
2614
2615 search = end + 1;
2616
2617 if (need_resched()) {
2618 mutex_unlock(&info->extent_ins_mutex);
2619 cond_resched();
2620 mutex_lock(&info->extent_ins_mutex);
2621 }
2622 }
2623
2624 if (nr) {
2625 ret = free_extents(trans, extent_root, &delete_list);
2626 BUG_ON(ret);
2627 }
2628
2629 if (all && skipped) {
2630 INIT_LIST_HEAD(&delete_list);
2631 search = 0;
2632 nr = 0;
2633 goto again;
2634 }
2635
2636 return err;
2637}
2638
2639/*
2640 * remove an extent from the root, returns 0 on success
2641 */
2642static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2643 struct btrfs_root *root,
2644 u64 bytenr, u64 num_bytes, u64 parent,
2645 u64 root_objectid, u64 ref_generation,
2646 u64 owner_objectid, int pin)
2647{
2648 struct btrfs_root *extent_root = root->fs_info->extent_root;
2649 int pending_ret;
2650 int ret;
2651
2652 WARN_ON(num_bytes < root->sectorsize);
2653 if (root == extent_root) {
2654 struct pending_extent_op *extent_op = NULL;
2655
2656 mutex_lock(&root->fs_info->extent_ins_mutex);
2657 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2658 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2659 u64 priv;
2660 ret = get_state_private(&root->fs_info->extent_ins,
2661 bytenr, &priv);
2662 BUG_ON(ret);
2663 extent_op = (struct pending_extent_op *)
2664 (unsigned long)priv;
2665
2666 extent_op->del = 1;
2667 if (extent_op->type == PENDING_EXTENT_INSERT) {
2668 mutex_unlock(&root->fs_info->extent_ins_mutex);
2669 return 0;
2670 }
2671 }
2672
2673 if (extent_op) {
2674 ref_generation = extent_op->orig_generation;
2675 parent = extent_op->orig_parent;
2676 }
2677
2678 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2679 BUG_ON(!extent_op);
2680
2681 extent_op->type = PENDING_EXTENT_DELETE;
2682 extent_op->bytenr = bytenr;
2683 extent_op->num_bytes = num_bytes;
2684 extent_op->parent = parent;
2685 extent_op->orig_parent = parent;
2686 extent_op->generation = ref_generation;
2687 extent_op->orig_generation = ref_generation;
2688 extent_op->level = (int)owner_objectid;
2689 INIT_LIST_HEAD(&extent_op->list);
2690 extent_op->del = 0;
2691
2692 set_extent_bits(&root->fs_info->pending_del,
2693 bytenr, bytenr + num_bytes - 1,
2694 EXTENT_WRITEBACK, GFP_NOFS);
2695 set_state_private(&root->fs_info->pending_del,
2696 bytenr, (unsigned long)extent_op);
2697 mutex_unlock(&root->fs_info->extent_ins_mutex);
2698 return 0;
2699 }
2700 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache;
2704
2705 /* btrfs_free_reserved_extent */
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0;
2712 }
2713 pin = 1;
2714 }
2715
2716 /* if data pin when any transaction has committed this */
2717 if (ref_generation != trans->transid)
2718 pin = 1;
2719
2720 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2721 root_objectid, ref_generation,
2722 owner_objectid, pin, pin == 0);
2723
2724 finish_current_insert(trans, root->fs_info->extent_root, 0);
2725 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2726 return ret ? ret : pending_ret;
2727}
2728
2729int btrfs_free_extent(struct btrfs_trans_handle *trans,
2730 struct btrfs_root *root,
2731 u64 bytenr, u64 num_bytes, u64 parent,
2732 u64 root_objectid, u64 ref_generation,
2733 u64 owner_objectid, int pin)
2734{
2735 int ret;
2736
2737 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2738 root_objectid, ref_generation,
2739 owner_objectid, pin);
2740 return ret;
2741}
2742
2743static u64 stripe_align(struct btrfs_root *root, u64 val)
2744{
2745 u64 mask = ((u64)root->stripesize - 1);
2746 u64 ret = (val + mask) & ~mask;
2747 return ret;
2748}
2749
2750/*
2751 * walks the btree of allocated extents and find a hole of a given size.
2752 * The key ins is changed to record the hole:
2753 * ins->objectid == block start
2754 * ins->flags = BTRFS_EXTENT_ITEM_KEY
2755 * ins->offset == number of blocks
2756 * Any available blocks before search_start are skipped.
2757 */
2758static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2759 struct btrfs_root *orig_root,
2760 u64 num_bytes, u64 empty_size,
2761 u64 search_start, u64 search_end,
2762 u64 hint_byte, struct btrfs_key *ins,
2763 u64 exclude_start, u64 exclude_nr,
2764 int data)
2765{
2766 int ret = 0;
2767 struct btrfs_root *root = orig_root->fs_info->extent_root;
2768 u64 total_needed = num_bytes;
2769 u64 *last_ptr = NULL;
2770 u64 last_wanted = 0;
2771 struct btrfs_block_group_cache *block_group = NULL;
2772 int chunk_alloc_done = 0;
2773 int empty_cluster = 2 * 1024 * 1024;
2774 int allowed_chunk_alloc = 0;
2775 struct list_head *head = NULL, *cur = NULL;
2776 int loop = 0;
2777 int extra_loop = 0;
2778 struct btrfs_space_info *space_info;
2779
2780 WARN_ON(num_bytes < root->sectorsize);
2781 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2782 ins->objectid = 0;
2783 ins->offset = 0;
2784
2785 if (orig_root->ref_cows || empty_size)
2786 allowed_chunk_alloc = 1;
2787
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024;
2791 }
2792
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2794 last_ptr = &root->fs_info->last_data_alloc;
2795
2796 if (last_ptr) {
2797 if (*last_ptr) {
2798 hint_byte = *last_ptr;
2799 last_wanted = *last_ptr;
2800 } else
2801 empty_size += empty_cluster;
2802 } else {
2803 empty_cluster = 0;
2804 }
2805 search_start = max(search_start, first_logical_byte(root, 0));
2806 search_start = max(search_start, hint_byte);
2807
2808 if (last_wanted && search_start != last_wanted) {
2809 last_wanted = 0;
2810 empty_size += empty_cluster;
2811 }
2812
2813 total_needed += empty_size;
2814 block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2815 if (!block_group)
2816 block_group = btrfs_lookup_first_block_group(root->fs_info,
2817 search_start);
2818 space_info = __find_space_info(root->fs_info, data);
2819
2820 down_read(&space_info->groups_sem);
2821 while (1) {
2822 struct btrfs_free_space *free_space;
2823 /*
2824 * the only way this happens if our hint points to a block
2825 * group thats not of the proper type, while looping this
2826 * should never happen
2827 */
2828 if (empty_size)
2829 extra_loop = 1;
2830
2831 if (!block_group)
2832 goto new_group_no_lock;
2833
2834 if (unlikely(!block_group->cached)) {
2835 mutex_lock(&block_group->cache_mutex);
2836 ret = cache_block_group(root, block_group);
2837 mutex_unlock(&block_group->cache_mutex);
2838 if (ret)
2839 break;
2840 }
2841
2842 mutex_lock(&block_group->alloc_mutex);
2843 if (unlikely(!block_group_bits(block_group, data)))
2844 goto new_group;
2845
2846 if (unlikely(block_group->ro))
2847 goto new_group;
2848
2849 free_space = btrfs_find_free_space(block_group, search_start,
2850 total_needed);
2851 if (free_space) {
2852 u64 start = block_group->key.objectid;
2853 u64 end = block_group->key.objectid +
2854 block_group->key.offset;
2855
2856 search_start = stripe_align(root, free_space->offset);
2857
2858 /* move on to the next group */
2859 if (search_start + num_bytes >= search_end)
2860 goto new_group;
2861
2862 /* move on to the next group */
2863 if (search_start + num_bytes > end)
2864 goto new_group;
2865
2866 if (last_wanted && search_start != last_wanted) {
2867 total_needed += empty_cluster;
2868 empty_size += empty_cluster;
2869 last_wanted = 0;
2870 /*
2871 * if search_start is still in this block group
2872 * then we just re-search this block group
2873 */
2874 if (search_start >= start &&
2875 search_start < end) {
2876 mutex_unlock(&block_group->alloc_mutex);
2877 continue;
2878 }
2879
2880 /* else we go to the next block group */
2881 goto new_group;
2882 }
2883
2884 if (exclude_nr > 0 &&
2885 (search_start + num_bytes > exclude_start &&
2886 search_start < exclude_start + exclude_nr)) {
2887 search_start = exclude_start + exclude_nr;
2888 /*
2889 * if search_start is still in this block group
2890 * then we just re-search this block group
2891 */
2892 if (search_start >= start &&
2893 search_start < end) {
2894 mutex_unlock(&block_group->alloc_mutex);
2895 last_wanted = 0;
2896 continue;
2897 }
2898
2899 /* else we go to the next block group */
2900 goto new_group;
2901 }
2902
2903 ins->objectid = search_start;
2904 ins->offset = num_bytes;
2905
2906 btrfs_remove_free_space_lock(block_group, search_start,
2907 num_bytes);
2908 /* we are all good, lets return */
2909 mutex_unlock(&block_group->alloc_mutex);
2910 break;
2911 }
2912new_group:
2913 mutex_unlock(&block_group->alloc_mutex);
2914 put_block_group(block_group);
2915 block_group = NULL;
2916new_group_no_lock:
2917 /* don't try to compare new allocations against the
2918 * last allocation any more
2919 */
2920 last_wanted = 0;
2921
2922 /*
2923 * Here's how this works.
2924 * loop == 0: we were searching a block group via a hint
2925 * and didn't find anything, so we start at
2926 * the head of the block groups and keep searching
2927 * loop == 1: we're searching through all of the block groups
2928 * if we hit the head again we have searched
2929 * all of the block groups for this space and we
2930 * need to try and allocate, if we cant error out.
2931 * loop == 2: we allocated more space and are looping through
2932 * all of the block groups again.
2933 */
2934 if (loop == 0) {
2935 head = &space_info->block_groups;
2936 cur = head->next;
2937 loop++;
2938 } else if (loop == 1 && cur == head) {
2939 int keep_going;
2940
2941 /* at this point we give up on the empty_size
2942 * allocations and just try to allocate the min
2943 * space.
2944 *
2945 * The extra_loop field was set if an empty_size
2946 * allocation was attempted above, and if this
2947 * is try we need to try the loop again without
2948 * the additional empty_size.
2949 */
2950 total_needed -= empty_size;
2951 empty_size = 0;
2952 keep_going = extra_loop;
2953 loop++;
2954
2955 if (allowed_chunk_alloc && !chunk_alloc_done) {
2956 up_read(&space_info->groups_sem);
2957 ret = do_chunk_alloc(trans, root, num_bytes +
2958 2 * 1024 * 1024, data, 1);
2959 down_read(&space_info->groups_sem);
2960 if (ret < 0)
2961 goto loop_check;
2962 head = &space_info->block_groups;
2963 /*
2964 * we've allocated a new chunk, keep
2965 * trying
2966 */
2967 keep_going = 1;
2968 chunk_alloc_done = 1;
2969 } else if (!allowed_chunk_alloc) {
2970 space_info->force_alloc = 1;
2971 }
2972loop_check:
2973 if (keep_going) {
2974 cur = head->next;
2975 extra_loop = 0;
2976 } else {
2977 break;
2978 }
2979 } else if (cur == head) {
2980 break;
2981 }
2982
2983 block_group = list_entry(cur, struct btrfs_block_group_cache,
2984 list);
2985 atomic_inc(&block_group->count);
2986
2987 search_start = block_group->key.objectid;
2988 cur = cur->next;
2989 }
2990
2991 /* we found what we needed */
2992 if (ins->objectid) {
2993 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2994 trans->block_group = block_group->key.objectid;
2995
2996 if (last_ptr)
2997 *last_ptr = ins->objectid + ins->offset;
2998 ret = 0;
2999 } else if (!ret) {
3000 printk(KERN_ERR "btrfs searching for %llu bytes, "
3001 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3002 (unsigned long long)total_needed,
3003 (unsigned long long)num_bytes,
3004 loop, allowed_chunk_alloc);
3005 ret = -ENOSPC;
3006 }
3007 if (block_group)
3008 put_block_group(block_group);
3009
3010 up_read(&space_info->groups_sem);
3011 return ret;
3012}
3013
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{
3016 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not ");
3023
3024 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n",
3030 (unsigned long long)cache->key.objectid,
3031 (unsigned long long)cache->key.offset,
3032 (unsigned long long)btrfs_block_group_used(&cache->item),
3033 (unsigned long long)cache->pinned,
3034 (unsigned long long)cache->reserved);
3035 btrfs_dump_free_space(cache, bytes);
3036 spin_unlock(&cache->lock);
3037 }
3038 up_read(&info->groups_sem);
3039}
3040
3041static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3042 struct btrfs_root *root,
3043 u64 num_bytes, u64 min_alloc_size,
3044 u64 empty_size, u64 hint_byte,
3045 u64 search_end, struct btrfs_key *ins,
3046 u64 data)
3047{
3048 int ret;
3049 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info;
3052
3053 if (data) {
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations
3071 */
3072 if (empty_size || root->ref_cows) {
3073 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075 2 * 1024 * 1024,
3076 BTRFS_BLOCK_GROUP_METADATA |
3077 (info->metadata_alloc_profile &
3078 info->avail_metadata_alloc_bits), 0);
3079 }
3080 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3081 num_bytes + 2 * 1024 * 1024, data, 0);
3082 }
3083
3084 WARN_ON(num_bytes < root->sectorsize);
3085 ret = find_free_extent(trans, root, num_bytes, empty_size,
3086 search_start, search_end, hint_byte, ins,
3087 trans->alloc_exclude_start,
3088 trans->alloc_exclude_nr, data);
3089
3090 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3091 num_bytes = num_bytes >> 1;
3092 num_bytes = num_bytes & ~(root->sectorsize - 1);
3093 num_bytes = max(num_bytes, min_alloc_size);
3094 do_chunk_alloc(trans, root->fs_info->extent_root,
3095 num_bytes, data, 1);
3096 goto again;
3097 }
3098 if (ret) {
3099 struct btrfs_space_info *sinfo;
3100
3101 sinfo = __find_space_info(root->fs_info, data);
3102 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3103 "wanted %llu\n", (unsigned long long)data,
3104 (unsigned long long)num_bytes);
3105 dump_space_info(sinfo, num_bytes);
3106 BUG();
3107 }
3108
3109 return ret;
3110}
3111
3112int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3113{
3114 struct btrfs_block_group_cache *cache;
3115 int ret = 0;
3116
3117 cache = btrfs_lookup_block_group(root->fs_info, start);
3118 if (!cache) {
3119 printk(KERN_ERR "Unable to find block group for %llu\n",
3120 (unsigned long long)start);
3121 return -ENOSPC;
3122 }
3123
3124 ret = btrfs_discard_extent(root, start, len);
3125
3126 btrfs_add_free_space(cache, start, len);
3127 put_block_group(cache);
3128 update_reserved_extents(root, start, len, 0);
3129
3130 return ret;
3131}
3132
3133int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3134 struct btrfs_root *root,
3135 u64 num_bytes, u64 min_alloc_size,
3136 u64 empty_size, u64 hint_byte,
3137 u64 search_end, struct btrfs_key *ins,
3138 u64 data)
3139{
3140 int ret;
3141 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3142 empty_size, hint_byte, search_end, ins,
3143 data);
3144 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3145 return ret;
3146}
3147
3148static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3149 struct btrfs_root *root, u64 parent,
3150 u64 root_objectid, u64 ref_generation,
3151 u64 owner, struct btrfs_key *ins)
3152{
3153 int ret;
3154 int pending_ret;
3155 u64 super_used;
3156 u64 root_used;
3157 u64 num_bytes = ins->offset;
3158 u32 sizes[2];
3159 struct btrfs_fs_info *info = root->fs_info;
3160 struct btrfs_root *extent_root = info->extent_root;
3161 struct btrfs_extent_item *extent_item;
3162 struct btrfs_extent_ref *ref;
3163 struct btrfs_path *path;
3164 struct btrfs_key keys[2];
3165
3166 if (parent == 0)
3167 parent = ins->objectid;
3168
3169 /* block accounting for super block */
3170 spin_lock(&info->delalloc_lock);
3171 super_used = btrfs_super_bytes_used(&info->super_copy);
3172 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3173
3174 /* block accounting for root item */
3175 root_used = btrfs_root_used(&root->root_item);
3176 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3177 spin_unlock(&info->delalloc_lock);
3178
3179 if (root == extent_root) {
3180 struct pending_extent_op *extent_op;
3181
3182 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3183 BUG_ON(!extent_op);
3184
3185 extent_op->type = PENDING_EXTENT_INSERT;
3186 extent_op->bytenr = ins->objectid;
3187 extent_op->num_bytes = ins->offset;
3188 extent_op->parent = parent;
3189 extent_op->orig_parent = 0;
3190 extent_op->generation = ref_generation;
3191 extent_op->orig_generation = 0;
3192 extent_op->level = (int)owner;
3193 INIT_LIST_HEAD(&extent_op->list);
3194 extent_op->del = 0;
3195
3196 mutex_lock(&root->fs_info->extent_ins_mutex);
3197 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3198 ins->objectid + ins->offset - 1,
3199 EXTENT_WRITEBACK, GFP_NOFS);
3200 set_state_private(&root->fs_info->extent_ins,
3201 ins->objectid, (unsigned long)extent_op);
3202 mutex_unlock(&root->fs_info->extent_ins_mutex);
3203 goto update_block;
3204 }
3205
3206 memcpy(&keys[0], ins, sizeof(*ins));
3207 keys[1].objectid = ins->objectid;
3208 keys[1].type = BTRFS_EXTENT_REF_KEY;
3209 keys[1].offset = parent;
3210 sizes[0] = sizeof(*extent_item);
3211 sizes[1] = sizeof(*ref);
3212
3213 path = btrfs_alloc_path();
3214 BUG_ON(!path);
3215
3216 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3217 sizes, 2);
3218 BUG_ON(ret);
3219
3220 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3221 struct btrfs_extent_item);
3222 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3223 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3224 struct btrfs_extent_ref);
3225
3226 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3227 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3228 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3229 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3230
3231 btrfs_mark_buffer_dirty(path->nodes[0]);
3232
3233 trans->alloc_exclude_start = 0;
3234 trans->alloc_exclude_nr = 0;
3235 btrfs_free_path(path);
3236 finish_current_insert(trans, extent_root, 0);
3237 pending_ret = del_pending_extents(trans, extent_root, 0);
3238
3239 if (ret)
3240 goto out;
3241 if (pending_ret) {
3242 ret = pending_ret;
3243 goto out;
3244 }
3245
3246update_block:
3247 ret = update_block_group(trans, root, ins->objectid,
3248 ins->offset, 1, 0);
3249 if (ret) {
3250 printk(KERN_ERR "btrfs update block group failed for %llu "
3251 "%llu\n", (unsigned long long)ins->objectid,
3252 (unsigned long long)ins->offset);
3253 BUG();
3254 }
3255out:
3256 return ret;
3257}
3258
3259int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, u64 parent,
3261 u64 root_objectid, u64 ref_generation,
3262 u64 owner, struct btrfs_key *ins)
3263{
3264 int ret;
3265
3266 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3267 return 0;
3268 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3269 ref_generation, owner, ins);
3270 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3271 return ret;
3272}
3273
3274/*
3275 * this is used by the tree logging recovery code. It records that
3276 * an extent has been allocated and makes sure to clear the free
3277 * space cache bits as well
3278 */
3279int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, u64 parent,
3281 u64 root_objectid, u64 ref_generation,
3282 u64 owner, struct btrfs_key *ins)
3283{
3284 int ret;
3285 struct btrfs_block_group_cache *block_group;
3286
3287 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3288 mutex_lock(&block_group->cache_mutex);
3289 cache_block_group(root, block_group);
3290 mutex_unlock(&block_group->cache_mutex);
3291
3292 ret = btrfs_remove_free_space(block_group, ins->objectid,
3293 ins->offset);
3294 BUG_ON(ret);
3295 put_block_group(block_group);
3296 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3297 ref_generation, owner, ins);
3298 return ret;
3299}
3300
3301/*
3302 * finds a free extent and does all the dirty work required for allocation
3303 * returns the key for the extent through ins, and a tree buffer for
3304 * the first block of the extent through buf.
3305 *
3306 * returns 0 if everything worked, non-zero otherwise.
3307 */
3308int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root,
3310 u64 num_bytes, u64 parent, u64 min_alloc_size,
3311 u64 root_objectid, u64 ref_generation,
3312 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3313 u64 search_end, struct btrfs_key *ins, u64 data)
3314{
3315 int ret;
3316
3317 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3318 min_alloc_size, empty_size, hint_byte,
3319 search_end, ins, data);
3320 BUG_ON(ret);
3321 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3322 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3323 root_objectid, ref_generation,
3324 owner_objectid, ins);
3325 BUG_ON(ret);
3326
3327 } else {
3328 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3329 }
3330 return ret;
3331}
3332
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize)
3336{
3337 struct extent_buffer *buf;
3338
3339 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3340 if (!buf)
3341 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf);
3345 btrfs_set_buffer_uptodate(buf);
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS);
3349 } else {
3350 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3351 buf->start + buf->len - 1, GFP_NOFS);
3352 }
3353 trans->blocks_used++;
3354 return buf;
3355}
3356
3357/*
3358 * helper function to allocate a block for a given tree
3359 * returns the tree buffer or NULL.
3360 */
3361struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3362 struct btrfs_root *root,
3363 u32 blocksize, u64 parent,
3364 u64 root_objectid,
3365 u64 ref_generation,
3366 int level,
3367 u64 hint,
3368 u64 empty_size)
3369{
3370 struct btrfs_key ins;
3371 int ret;
3372 struct extent_buffer *buf;
3373
3374 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3375 root_objectid, ref_generation, level,
3376 empty_size, hint, (u64)-1, &ins, 0);
3377 if (ret) {
3378 BUG_ON(ret > 0);
3379 return ERR_PTR(ret);
3380 }
3381
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3383 return buf;
3384}
3385
3386int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3387 struct btrfs_root *root, struct extent_buffer *leaf)
3388{
3389 u64 leaf_owner;
3390 u64 leaf_generation;
3391 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi;
3393 int i;
3394 int nritems;
3395 int ret;
3396
3397 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf);
3401
3402 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr;
3404 cond_resched();
3405
3406 btrfs_item_key_to_cpu(leaf, &key, i);
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue;
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3410 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE)
3412 continue;
3413 /*
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3418 if (disk_bytenr == 0)
3419 continue;
3420
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation,
3424 key.objectid, 0);
3425 BUG_ON(ret);
3426
3427 atomic_inc(&root->fs_info->throttle_gen);
3428 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched();
3430 }
3431 return 0;
3432}
3433
3434static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3435 struct btrfs_root *root,
3436 struct btrfs_leaf_ref *ref)
3437{
3438 int i;
3439 int ret;
3440 struct btrfs_extent_info *info = ref->extents;
3441
3442 for (i = 0; i < ref->nritems; i++) {
3443 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation,
3446 info->objectid, 0);
3447
3448 atomic_inc(&root->fs_info->throttle_gen);
3449 wake_up(&root->fs_info->transaction_throttle);
3450 cond_resched();
3451
3452 BUG_ON(ret);
3453 info++;
3454 }
3455
3456 return 0;
3457}
3458
3459static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3460 u64 len, u32 *refs)
3461{
3462 int ret;
3463
3464 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3465 BUG_ON(ret);
3466
3467#if 0 /* some debugging code in case we see problems here */
3468 /* if the refs count is one, it won't get increased again. But
3469 * if the ref count is > 1, someone may be decreasing it at
3470 * the same time we are.
3471 */
3472 if (*refs != 1) {
3473 struct extent_buffer *eb = NULL;
3474 eb = btrfs_find_create_tree_block(root, start, len);
3475 if (eb)
3476 btrfs_tree_lock(eb);
3477
3478 mutex_lock(&root->fs_info->alloc_mutex);
3479 ret = lookup_extent_ref(NULL, root, start, len, refs);
3480 BUG_ON(ret);
3481 mutex_unlock(&root->fs_info->alloc_mutex);
3482
3483 if (eb) {
3484 btrfs_tree_unlock(eb);
3485 free_extent_buffer(eb);
3486 }
3487 if (*refs == 1) {
3488 printk(KERN_ERR "btrfs block %llu went down to one "
3489 "during drop_snap\n", (unsigned long long)start);
3490 }
3491
3492 }
3493#endif
3494
3495 cond_resched();
3496 return ret;
3497}
3498
3499/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes.
3502 */
3503static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *root,
3505 struct btrfs_path *path, int *level)
3506{
3507 u64 root_owner;
3508 u64 root_gen;
3509 u64 bytenr;
3510 u64 ptr_gen;
3511 struct extent_buffer *next;
3512 struct extent_buffer *cur;
3513 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize;
3516 int ret;
3517 u32 refs;
3518
3519 WARN_ON(*level < 0);
3520 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3521 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3522 path->nodes[*level]->len, &refs);
3523 BUG_ON(ret);
3524 if (refs > 1)
3525 goto out;
3526
3527 /*
3528 * walk down to the last node level and free all the leaves
3529 */
3530 while (*level >= 0) {
3531 WARN_ON(*level < 0);
3532 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3533 cur = path->nodes[*level];
3534
3535 if (btrfs_header_level(cur) != *level)
3536 WARN_ON(1);
3537
3538 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur))
3540 break;
3541 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret);
3544 break;
3545 }
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1);
3549
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret);
3552 if (refs != 1) {
3553 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent);
3555 root_gen = btrfs_header_generation(parent);
3556 path->slots[*level]++;
3557
3558 ret = __btrfs_free_extent(trans, root, bytenr,
3559 blocksize, parent->start,
3560 root_owner, root_gen,
3561 *level - 1, 1);
3562 BUG_ON(ret);
3563
3564 atomic_inc(&root->fs_info->throttle_gen);
3565 wake_up(&root->fs_info->transaction_throttle);
3566 cond_resched();
3567
3568 continue;
3569 }
3570 /*
3571 * at this point, we have a single ref, and since the
3572 * only place referencing this extent is a dead root
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */
3576 if (*level == 1) {
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]);
3613 path->nodes[*level-1] = next;
3614 *level = btrfs_header_level(next);
3615 path->slots[*level] = 0;
3616 cond_resched();
3617 }
3618out:
3619 WARN_ON(*level < 0);
3620 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3621
3622 if (path->nodes[*level] == root->node) {
3623 parent = path->nodes[*level];
3624 bytenr = path->nodes[*level]->start;
3625 } else {
3626 parent = path->nodes[*level + 1];
3627 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3628 }
3629
3630 blocksize = btrfs_level_size(root, *level);
3631 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent);
3633
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen,
3636 *level, 1);
3637 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL;
3639 *level += 1;
3640 BUG_ON(ret);
3641
3642 cond_resched();
3643 return 0;
3644}
3645
3646/*
3647 * helper function for drop_subtree, this function is similar to
3648 * walk_down_tree. The main difference is that it checks reference
3649 * counts while tree blocks are locked.
3650 */
3651static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3652 struct btrfs_root *root,
3653 struct btrfs_path *path, int *level)
3654{
3655 struct extent_buffer *next;
3656 struct extent_buffer *cur;
3657 struct extent_buffer *parent;
3658 u64 bytenr;
3659 u64 ptr_gen;
3660 u32 blocksize;
3661 u32 refs;
3662 int ret;
3663
3664 cur = path->nodes[*level];
3665 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3666 &refs);
3667 BUG_ON(ret);
3668 if (refs > 1)
3669 goto out;
3670
3671 while (*level >= 0) {
3672 cur = path->nodes[*level];
3673 if (*level == 0) {
3674 ret = btrfs_drop_leaf_ref(trans, root, cur);
3675 BUG_ON(ret);
3676 clean_tree_block(trans, root, cur);
3677 break;
3678 }
3679 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3680 clean_tree_block(trans, root, cur);
3681 break;
3682 }
3683
3684 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3685 blocksize = btrfs_level_size(root, *level - 1);
3686 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3687
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next);
3690
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs);
3693 BUG_ON(ret);
3694 if (refs > 1) {
3695 parent = path->nodes[*level];
3696 ret = btrfs_free_extent(trans, root, bytenr,
3697 blocksize, parent->start,
3698 btrfs_header_owner(parent),
3699 btrfs_header_generation(parent),
3700 *level - 1, 1);
3701 BUG_ON(ret);
3702 path->slots[*level]++;
3703 btrfs_tree_unlock(next);
3704 free_extent_buffer(next);
3705 continue;
3706 }
3707
3708 *level = btrfs_header_level(next);
3709 path->nodes[*level] = next;
3710 path->slots[*level] = 0;
3711 path->locks[*level] = 1;
3712 cond_resched();
3713 }
3714out:
3715 parent = path->nodes[*level + 1];
3716 bytenr = path->nodes[*level]->start;
3717 blocksize = path->nodes[*level]->len;
3718
3719 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3720 parent->start, btrfs_header_owner(parent),
3721 btrfs_header_generation(parent), *level, 1);
3722 BUG_ON(ret);
3723
3724 if (path->locks[*level]) {
3725 btrfs_tree_unlock(path->nodes[*level]);
3726 path->locks[*level] = 0;
3727 }
3728 free_extent_buffer(path->nodes[*level]);
3729 path->nodes[*level] = NULL;
3730 *level += 1;
3731 cond_resched();
3732 return 0;
3733}
3734
3735/*
3736 * helper for dropping snapshots. This walks back up the tree in the path
3737 * to find the first node higher up where we haven't yet gone through
3738 * all the slots
3739 */
3740static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3741 struct btrfs_root *root,
3742 struct btrfs_path *path,
3743 int *level, int max_level)
3744{
3745 u64 root_owner;
3746 u64 root_gen;
3747 struct btrfs_root_item *root_item = &root->root_item;
3748 int i;
3749 int slot;
3750 int ret;
3751
3752 for (i = *level; i < max_level && path->nodes[i]; i++) {
3753 slot = path->slots[i];
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key;
3757 node = path->nodes[i];
3758 path->slots[i]++;
3759 *level = i;
3760 WARN_ON(*level == 0);
3761 btrfs_node_key(node, &disk_key, path->slots[i]);
3762 memcpy(&root_item->drop_progress,
3763 &disk_key, sizeof(disk_key));
3764 root_item->drop_level = i;
3765 return 0;
3766 } else {
3767 struct extent_buffer *parent;
3768 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level];
3770 else
3771 parent = path->nodes[*level + 1];
3772
3773 root_owner = btrfs_header_owner(parent);
3774 root_gen = btrfs_header_generation(parent);
3775
3776 clean_tree_block(trans, root, path->nodes[*level]);
3777 ret = btrfs_free_extent(trans, root,
3778 path->nodes[*level]->start,
3779 path->nodes[*level]->len,
3780 parent->start, root_owner,
3781 root_gen, *level, 1);
3782 BUG_ON(ret);
3783 if (path->locks[*level]) {
3784 btrfs_tree_unlock(path->nodes[*level]);
3785 path->locks[*level] = 0;
3786 }
3787 free_extent_buffer(path->nodes[*level]);
3788 path->nodes[*level] = NULL;
3789 *level = i + 1;
3790 }
3791 }
3792 return 1;
3793}
3794
3795/*
3796 * drop the reference count on the tree rooted at 'snap'. This traverses
3797 * the tree freeing any blocks that have a ref count of zero after being
3798 * decremented.
3799 */
3800int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3801 *root)
3802{
3803 int ret = 0;
3804 int wret;
3805 int level;
3806 struct btrfs_path *path;
3807 int i;
3808 int orig_level;
3809 struct btrfs_root_item *root_item = &root->root_item;
3810
3811 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3812 path = btrfs_alloc_path();
3813 BUG_ON(!path);
3814
3815 level = btrfs_header_level(root->node);
3816 orig_level = level;
3817 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3818 path->nodes[level] = root->node;
3819 extent_buffer_get(root->node);
3820 path->slots[level] = 0;
3821 } else {
3822 struct btrfs_key key;
3823 struct btrfs_disk_key found_key;
3824 struct extent_buffer *node;
3825
3826 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3827 level = root_item->drop_level;
3828 path->lowest_level = level;
3829 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3830 if (wret < 0) {
3831 ret = wret;
3832 goto out;
3833 }
3834 node = path->nodes[level];
3835 btrfs_node_key(node, &found_key, path->slots[level]);
3836 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3837 sizeof(found_key)));
3838 /*
3839 * unlock our path, this is safe because only this
3840 * function is allowed to delete this snapshot
3841 */
3842 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3843 if (path->nodes[i] && path->locks[i]) {
3844 path->locks[i] = 0;
3845 btrfs_tree_unlock(path->nodes[i]);
3846 }
3847 }
3848 }
3849 while (1) {
3850 wret = walk_down_tree(trans, root, path, &level);
3851 if (wret > 0)
3852 break;
3853 if (wret < 0)
3854 ret = wret;
3855
3856 wret = walk_up_tree(trans, root, path, &level,
3857 BTRFS_MAX_LEVEL);
3858 if (wret > 0)
3859 break;
3860 if (wret < 0)
3861 ret = wret;
3862 if (trans->transaction->in_commit) {
3863 ret = -EAGAIN;
3864 break;
3865 }
3866 atomic_inc(&root->fs_info->throttle_gen);
3867 wake_up(&root->fs_info->transaction_throttle);
3868 }
3869 for (i = 0; i <= orig_level; i++) {
3870 if (path->nodes[i]) {
3871 free_extent_buffer(path->nodes[i]);
3872 path->nodes[i] = NULL;
3873 }
3874 }
3875out:
3876 btrfs_free_path(path);
3877 return ret;
3878}
3879
3880int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3881 struct btrfs_root *root,
3882 struct extent_buffer *node,
3883 struct extent_buffer *parent)
3884{
3885 struct btrfs_path *path;
3886 int level;
3887 int parent_level;
3888 int ret = 0;
3889 int wret;
3890
3891 path = btrfs_alloc_path();
3892 BUG_ON(!path);
3893
3894 BUG_ON(!btrfs_tree_locked(parent));
3895 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent);
3899
3900 BUG_ON(!btrfs_tree_locked(node));
3901 level = btrfs_header_level(node);
3902 extent_buffer_get(node);
3903 path->nodes[level] = node;
3904 path->slots[level] = 0;
3905
3906 while (1) {
3907 wret = walk_down_subtree(trans, root, path, &level);
3908 if (wret < 0)
3909 ret = wret;
3910 if (wret != 0)
3911 break;
3912
3913 wret = walk_up_tree(trans, root, path, &level, parent_level);
3914 if (wret < 0)
3915 ret = wret;
3916 if (wret != 0)
3917 break;
3918 }
3919
3920 btrfs_free_path(path);
3921 return ret;
3922}
3923
3924static unsigned long calc_ra(unsigned long start, unsigned long last,
3925 unsigned long nr)
3926{
3927 return min(last, start + nr - 1);
3928}
3929
3930static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3931 u64 len)
3932{
3933 u64 page_start;
3934 u64 page_end;
3935 unsigned long first_index;
3936 unsigned long last_index;
3937 unsigned long i;
3938 struct page *page;
3939 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3940 struct file_ra_state *ra;
3941 struct btrfs_ordered_extent *ordered;
3942 unsigned int total_read = 0;
3943 unsigned int total_dirty = 0;
3944 int ret = 0;
3945
3946 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3947
3948 mutex_lock(&inode->i_mutex);
3949 first_index = start >> PAGE_CACHE_SHIFT;
3950 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3951
3952 /* make sure the dirty trick played by the caller work */
3953 ret = invalidate_inode_pages2_range(inode->i_mapping,
3954 first_index, last_index);
3955 if (ret)
3956 goto out_unlock;
3957
3958 file_ra_state_init(ra, inode->i_mapping);
3959
3960 for (i = first_index ; i <= last_index; i++) {
3961 if (total_read % ra->ra_pages == 0) {
3962 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3963 calc_ra(i, last_index, ra->ra_pages));
3964 }
3965 total_read++;
3966again:
3967 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3968 BUG_ON(1);
3969 page = grab_cache_page(inode->i_mapping, i);
3970 if (!page) {
3971 ret = -ENOMEM;
3972 goto out_unlock;
3973 }
3974 if (!PageUptodate(page)) {
3975 btrfs_readpage(NULL, page);
3976 lock_page(page);
3977 if (!PageUptodate(page)) {
3978 unlock_page(page);
3979 page_cache_release(page);
3980 ret = -EIO;
3981 goto out_unlock;
3982 }
3983 }
3984 wait_on_page_writeback(page);
3985
3986 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3987 page_end = page_start + PAGE_CACHE_SIZE - 1;
3988 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3989
3990 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3991 if (ordered) {
3992 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3993 unlock_page(page);
3994 page_cache_release(page);
3995 btrfs_start_ordered_extent(inode, ordered, 1);
3996 btrfs_put_ordered_extent(ordered);
3997 goto again;
3998 }
3999 set_page_extent_mapped(page);
4000
4001 if (i == first_index)
4002 set_extent_bits(io_tree, page_start, page_end,
4003 EXTENT_BOUNDARY, GFP_NOFS);
4004 btrfs_set_extent_delalloc(inode, page_start, page_end);
4005
4006 set_page_dirty(page);
4007 total_dirty++;
4008
4009 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4010 unlock_page(page);
4011 page_cache_release(page);
4012 }
4013
4014out_unlock:
4015 kfree(ra);
4016 mutex_unlock(&inode->i_mutex);
4017 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4018 return ret;
4019}
4020
4021static noinline int relocate_data_extent(struct inode *reloc_inode,
4022 struct btrfs_key *extent_key,
4023 u64 offset)
4024{
4025 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4026 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4027 struct extent_map *em;
4028 u64 start = extent_key->objectid - offset;
4029 u64 end = start + extent_key->offset - 1;
4030
4031 em = alloc_extent_map(GFP_NOFS);
4032 BUG_ON(!em || IS_ERR(em));
4033
4034 em->start = start;
4035 em->len = extent_key->offset;
4036 em->block_len = extent_key->offset;
4037 em->block_start = extent_key->objectid;
4038 em->bdev = root->fs_info->fs_devices->latest_bdev;
4039 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4040
4041 /* setup extent map to cheat btrfs_readpage */
4042 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4043 while (1) {
4044 int ret;
4045 spin_lock(&em_tree->lock);
4046 ret = add_extent_mapping(em_tree, em);
4047 spin_unlock(&em_tree->lock);
4048 if (ret != -EEXIST) {
4049 free_extent_map(em);
4050 break;
4051 }
4052 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4053 }
4054 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4055
4056 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4057}
4058
4059struct btrfs_ref_path {
4060 u64 extent_start;
4061 u64 nodes[BTRFS_MAX_LEVEL];
4062 u64 root_objectid;
4063 u64 root_generation;
4064 u64 owner_objectid;
4065 u32 num_refs;
4066 int lowest_level;
4067 int current_level;
4068 int shared_level;
4069
4070 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4071 u64 new_nodes[BTRFS_MAX_LEVEL];
4072};
4073
4074struct disk_extent {
4075 u64 ram_bytes;
4076 u64 disk_bytenr;
4077 u64 disk_num_bytes;
4078 u64 offset;
4079 u64 num_bytes;
4080 u8 compression;
4081 u8 encryption;
4082 u16 other_encoding;
4083};
4084
4085static int is_cowonly_root(u64 root_objectid)
4086{
4087 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4088 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4089 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4090 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4091 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4092 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4093 return 1;
4094 return 0;
4095}
4096
4097static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4098 struct btrfs_root *extent_root,
4099 struct btrfs_ref_path *ref_path,
4100 int first_time)
4101{
4102 struct extent_buffer *leaf;
4103 struct btrfs_path *path;
4104 struct btrfs_extent_ref *ref;
4105 struct btrfs_key key;
4106 struct btrfs_key found_key;
4107 u64 bytenr;
4108 u32 nritems;
4109 int level;
4110 int ret = 1;
4111
4112 path = btrfs_alloc_path();
4113 if (!path)
4114 return -ENOMEM;
4115
4116 if (first_time) {
4117 ref_path->lowest_level = -1;
4118 ref_path->current_level = -1;
4119 ref_path->shared_level = -1;
4120 goto walk_up;
4121 }
4122walk_down:
4123 level = ref_path->current_level - 1;
4124 while (level >= -1) {
4125 u64 parent;
4126 if (level < ref_path->lowest_level)
4127 break;
4128
4129 if (level >= 0)
4130 bytenr = ref_path->nodes[level];
4131 else
4132 bytenr = ref_path->extent_start;
4133 BUG_ON(bytenr == 0);
4134
4135 parent = ref_path->nodes[level + 1];
4136 ref_path->nodes[level + 1] = 0;
4137 ref_path->current_level = level;
4138 BUG_ON(parent == 0);
4139
4140 key.objectid = bytenr;
4141 key.offset = parent + 1;
4142 key.type = BTRFS_EXTENT_REF_KEY;
4143
4144 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4145 if (ret < 0)
4146 goto out;
4147 BUG_ON(ret == 0);
4148
4149 leaf = path->nodes[0];
4150 nritems = btrfs_header_nritems(leaf);
4151 if (path->slots[0] >= nritems) {
4152 ret = btrfs_next_leaf(extent_root, path);
4153 if (ret < 0)
4154 goto out;
4155 if (ret > 0)
4156 goto next;
4157 leaf = path->nodes[0];
4158 }
4159
4160 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4161 if (found_key.objectid == bytenr &&
4162 found_key.type == BTRFS_EXTENT_REF_KEY) {
4163 if (level < ref_path->shared_level)
4164 ref_path->shared_level = level;
4165 goto found;
4166 }
4167next:
4168 level--;
4169 btrfs_release_path(extent_root, path);
4170 cond_resched();
4171 }
4172 /* reached lowest level */
4173 ret = 1;
4174 goto out;
4175walk_up:
4176 level = ref_path->current_level;
4177 while (level < BTRFS_MAX_LEVEL - 1) {
4178 u64 ref_objectid;
4179
4180 if (level >= 0)
4181 bytenr = ref_path->nodes[level];
4182 else
4183 bytenr = ref_path->extent_start;
4184
4185 BUG_ON(bytenr == 0);
4186
4187 key.objectid = bytenr;
4188 key.offset = 0;
4189 key.type = BTRFS_EXTENT_REF_KEY;
4190
4191 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4192 if (ret < 0)
4193 goto out;
4194
4195 leaf = path->nodes[0];
4196 nritems = btrfs_header_nritems(leaf);
4197 if (path->slots[0] >= nritems) {
4198 ret = btrfs_next_leaf(extent_root, path);
4199 if (ret < 0)
4200 goto out;
4201 if (ret > 0) {
4202 /* the extent was freed by someone */
4203 if (ref_path->lowest_level == level)
4204 goto out;
4205 btrfs_release_path(extent_root, path);
4206 goto walk_down;
4207 }
4208 leaf = path->nodes[0];
4209 }
4210
4211 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4212 if (found_key.objectid != bytenr ||
4213 found_key.type != BTRFS_EXTENT_REF_KEY) {
4214 /* the extent was freed by someone */
4215 if (ref_path->lowest_level == level) {
4216 ret = 1;
4217 goto out;
4218 }
4219 btrfs_release_path(extent_root, path);
4220 goto walk_down;
4221 }
4222found:
4223 ref = btrfs_item_ptr(leaf, path->slots[0],
4224 struct btrfs_extent_ref);
4225 ref_objectid = btrfs_ref_objectid(leaf, ref);
4226 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4227 if (first_time) {
4228 level = (int)ref_objectid;
4229 BUG_ON(level >= BTRFS_MAX_LEVEL);
4230 ref_path->lowest_level = level;
4231 ref_path->current_level = level;
4232 ref_path->nodes[level] = bytenr;
4233 } else {
4234 WARN_ON(ref_objectid != level);
4235 }
4236 } else {
4237 WARN_ON(level != -1);
4238 }
4239 first_time = 0;
4240
4241 if (ref_path->lowest_level == level) {
4242 ref_path->owner_objectid = ref_objectid;
4243 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4244 }
4245
4246 /*
4247 * the block is tree root or the block isn't in reference
4248 * counted tree.
4249 */
4250 if (found_key.objectid == found_key.offset ||
4251 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4252 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4253 ref_path->root_generation =
4254 btrfs_ref_generation(leaf, ref);
4255 if (level < 0) {
4256 /* special reference from the tree log */
4257 ref_path->nodes[0] = found_key.offset;
4258 ref_path->current_level = 0;
4259 }
4260 ret = 0;
4261 goto out;
4262 }
4263
4264 level++;
4265 BUG_ON(ref_path->nodes[level] != 0);
4266 ref_path->nodes[level] = found_key.offset;
4267 ref_path->current_level = level;
4268
4269 /*
4270 * the reference was created in the running transaction,
4271 * no need to continue walking up.
4272 */
4273 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4274 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4275 ref_path->root_generation =
4276 btrfs_ref_generation(leaf, ref);
4277 ret = 0;
4278 goto out;
4279 }
4280
4281 btrfs_release_path(extent_root, path);
4282 cond_resched();
4283 }
4284 /* reached max tree level, but no tree root found. */
4285 BUG();
4286out:
4287 btrfs_free_path(path);
4288 return ret;
4289}
4290
4291static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4292 struct btrfs_root *extent_root,
4293 struct btrfs_ref_path *ref_path,
4294 u64 extent_start)
4295{
4296 memset(ref_path, 0, sizeof(*ref_path));
4297 ref_path->extent_start = extent_start;
4298
4299 return __next_ref_path(trans, extent_root, ref_path, 1);
4300}
4301
4302static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4303 struct btrfs_root *extent_root,
4304 struct btrfs_ref_path *ref_path)
4305{
4306 return __next_ref_path(trans, extent_root, ref_path, 0);
4307}
4308
4309static noinline int get_new_locations(struct inode *reloc_inode,
4310 struct btrfs_key *extent_key,
4311 u64 offset, int no_fragment,
4312 struct disk_extent **extents,
4313 int *nr_extents)
4314{
4315 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4316 struct btrfs_path *path;
4317 struct btrfs_file_extent_item *fi;
4318 struct extent_buffer *leaf;
4319 struct disk_extent *exts = *extents;
4320 struct btrfs_key found_key;
4321 u64 cur_pos;
4322 u64 last_byte;
4323 u32 nritems;
4324 int nr = 0;
4325 int max = *nr_extents;
4326 int ret;
4327
4328 WARN_ON(!no_fragment && *extents);
4329 if (!exts) {
4330 max = 1;
4331 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4332 if (!exts)
4333 return -ENOMEM;
4334 }
4335
4336 path = btrfs_alloc_path();
4337 BUG_ON(!path);
4338
4339 cur_pos = extent_key->objectid - offset;
4340 last_byte = extent_key->objectid + extent_key->offset;
4341 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4342 cur_pos, 0);
4343 if (ret < 0)
4344 goto out;
4345 if (ret > 0) {
4346 ret = -ENOENT;
4347 goto out;
4348 }
4349
4350 while (1) {
4351 leaf = path->nodes[0];
4352 nritems = btrfs_header_nritems(leaf);
4353 if (path->slots[0] >= nritems) {
4354 ret = btrfs_next_leaf(root, path);
4355 if (ret < 0)
4356 goto out;
4357 if (ret > 0)
4358 break;
4359 leaf = path->nodes[0];
4360 }
4361
4362 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4363 if (found_key.offset != cur_pos ||
4364 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4365 found_key.objectid != reloc_inode->i_ino)
4366 break;
4367
4368 fi = btrfs_item_ptr(leaf, path->slots[0],
4369 struct btrfs_file_extent_item);
4370 if (btrfs_file_extent_type(leaf, fi) !=
4371 BTRFS_FILE_EXTENT_REG ||
4372 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4373 break;
4374
4375 if (nr == max) {
4376 struct disk_extent *old = exts;
4377 max *= 2;
4378 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4379 memcpy(exts, old, sizeof(*exts) * nr);
4380 if (old != *extents)
4381 kfree(old);
4382 }
4383
4384 exts[nr].disk_bytenr =
4385 btrfs_file_extent_disk_bytenr(leaf, fi);
4386 exts[nr].disk_num_bytes =
4387 btrfs_file_extent_disk_num_bytes(leaf, fi);
4388 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4389 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4390 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4391 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4392 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4393 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4394 fi);
4395 BUG_ON(exts[nr].offset > 0);
4396 BUG_ON(exts[nr].compression || exts[nr].encryption);
4397 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4398
4399 cur_pos += exts[nr].num_bytes;
4400 nr++;
4401
4402 if (cur_pos + offset >= last_byte)
4403 break;
4404
4405 if (no_fragment) {
4406 ret = 1;
4407 goto out;
4408 }
4409 path->slots[0]++;
4410 }
4411
4412 BUG_ON(cur_pos + offset > last_byte);
4413 if (cur_pos + offset < last_byte) {
4414 ret = -ENOENT;
4415 goto out;
4416 }
4417 ret = 0;
4418out:
4419 btrfs_free_path(path);
4420 if (ret) {
4421 if (exts != *extents)
4422 kfree(exts);
4423 } else {
4424 *extents = exts;
4425 *nr_extents = nr;
4426 }
4427 return ret;
4428}
4429
4430static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4431 struct btrfs_root *root,
4432 struct btrfs_path *path,
4433 struct btrfs_key *extent_key,
4434 struct btrfs_key *leaf_key,
4435 struct btrfs_ref_path *ref_path,
4436 struct disk_extent *new_extents,
4437 int nr_extents)
4438{
4439 struct extent_buffer *leaf;
4440 struct btrfs_file_extent_item *fi;
4441 struct inode *inode = NULL;
4442 struct btrfs_key key;
4443 u64 lock_start = 0;
4444 u64 lock_end = 0;
4445 u64 num_bytes;
4446 u64 ext_offset;
4447 u64 first_pos;
4448 u32 nritems;
4449 int nr_scaned = 0;
4450 int extent_locked = 0;
4451 int extent_type;
4452 int ret;
4453
4454 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid &&
4459 key.type < BTRFS_EXTENT_DATA_KEY)) {
4460 key.objectid = ref_path->owner_objectid;
4461 key.type = BTRFS_EXTENT_DATA_KEY;
4462 key.offset = 0;
4463 }
4464 }
4465
4466 while (1) {
4467 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468 if (ret < 0)
4469 goto out;
4470
4471 leaf = path->nodes[0];
4472 nritems = btrfs_header_nritems(leaf);
4473next:
4474 if (extent_locked && ret > 0) {
4475 /*
4476 * the file extent item was modified by someone
4477 * before the extent got locked.
4478 */
4479 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4480 lock_end, GFP_NOFS);
4481 extent_locked = 0;
4482 }
4483
4484 if (path->slots[0] >= nritems) {
4485 if (++nr_scaned > 2)
4486 break;
4487
4488 BUG_ON(extent_locked);
4489 ret = btrfs_next_leaf(root, path);
4490 if (ret < 0)
4491 goto out;
4492 if (ret > 0)
4493 break;
4494 leaf = path->nodes[0];
4495 nritems = btrfs_header_nritems(leaf);
4496 }
4497
4498 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4499
4500 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4501 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset))
4505 break;
4506 }
4507
4508 if (inode && key.objectid != inode->i_ino) {
4509 BUG_ON(extent_locked);
4510 btrfs_release_path(root, path);
4511 mutex_unlock(&inode->i_mutex);
4512 iput(inode);
4513 inode = NULL;
4514 continue;
4515 }
4516
4517 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4518 path->slots[0]++;
4519 ret = 1;
4520 goto next;
4521 }
4522 fi = btrfs_item_ptr(leaf, path->slots[0],
4523 struct btrfs_file_extent_item);
4524 extent_type = btrfs_file_extent_type(leaf, fi);
4525 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4526 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4527 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4528 extent_key->objectid)) {
4529 path->slots[0]++;
4530 ret = 1;
4531 goto next;
4532 }
4533
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536
4537 if (first_pos > key.offset - ext_offset)
4538 first_pos = key.offset - ext_offset;
4539
4540 if (!extent_locked) {
4541 lock_start = key.offset;
4542 lock_end = lock_start + num_bytes - 1;
4543 } else {
4544 if (lock_start > key.offset ||
4545 lock_end + 1 < key.offset + num_bytes) {
4546 unlock_extent(&BTRFS_I(inode)->io_tree,
4547 lock_start, lock_end, GFP_NOFS);
4548 extent_locked = 0;
4549 }
4550 }
4551
4552 if (!inode) {
4553 btrfs_release_path(root, path);
4554
4555 inode = btrfs_iget_locked(root->fs_info->sb,
4556 key.objectid, root);
4557 if (inode->i_state & I_NEW) {
4558 BTRFS_I(inode)->root = root;
4559 BTRFS_I(inode)->location.objectid =
4560 key.objectid;
4561 BTRFS_I(inode)->location.type =
4562 BTRFS_INODE_ITEM_KEY;
4563 BTRFS_I(inode)->location.offset = 0;
4564 btrfs_read_locked_inode(inode);
4565 unlock_new_inode(inode);
4566 }
4567 /*
4568 * some code call btrfs_commit_transaction while
4569 * holding the i_mutex, so we can't use mutex_lock
4570 * here.
4571 */
4572 if (is_bad_inode(inode) ||
4573 !mutex_trylock(&inode->i_mutex)) {
4574 iput(inode);
4575 inode = NULL;
4576 key.offset = (u64)-1;
4577 goto skip;
4578 }
4579 }
4580
4581 if (!extent_locked) {
4582 struct btrfs_ordered_extent *ordered;
4583
4584 btrfs_release_path(root, path);
4585
4586 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4587 lock_end, GFP_NOFS);
4588 ordered = btrfs_lookup_first_ordered_extent(inode,
4589 lock_end);
4590 if (ordered &&
4591 ordered->file_offset <= lock_end &&
4592 ordered->file_offset + ordered->len > lock_start) {
4593 unlock_extent(&BTRFS_I(inode)->io_tree,
4594 lock_start, lock_end, GFP_NOFS);
4595 btrfs_start_ordered_extent(inode, ordered, 1);
4596 btrfs_put_ordered_extent(ordered);
4597 key.offset += num_bytes;
4598 goto skip;
4599 }
4600 if (ordered)
4601 btrfs_put_ordered_extent(ordered);
4602
4603 extent_locked = 1;
4604 continue;
4605 }
4606
4607 if (nr_extents == 1) {
4608 /* update extent pointer in place */
4609 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4610 new_extents[0].disk_bytenr);
4611 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4612 new_extents[0].disk_num_bytes);
4613 btrfs_mark_buffer_dirty(leaf);
4614
4615 btrfs_drop_extent_cache(inode, key.offset,
4616 key.offset + num_bytes - 1, 0);
4617
4618 ret = btrfs_inc_extent_ref(trans, root,
4619 new_extents[0].disk_bytenr,
4620 new_extents[0].disk_num_bytes,
4621 leaf->start,
4622 root->root_key.objectid,
4623 trans->transid,
4624 key.objectid);
4625 BUG_ON(ret);
4626
4627 ret = btrfs_free_extent(trans, root,
4628 extent_key->objectid,
4629 extent_key->offset,
4630 leaf->start,
4631 btrfs_header_owner(leaf),
4632 btrfs_header_generation(leaf),
4633 key.objectid, 0);
4634 BUG_ON(ret);
4635
4636 btrfs_release_path(root, path);
4637 key.offset += num_bytes;
4638 } else {
4639 BUG_ON(1);
4640#if 0
4641 u64 alloc_hint;
4642 u64 extent_len;
4643 int i;
4644 /*
4645 * drop old extent pointer at first, then insert the
4646 * new pointers one bye one
4647 */
4648 btrfs_release_path(root, path);
4649 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4650 key.offset + num_bytes,
4651 key.offset, &alloc_hint);
4652 BUG_ON(ret);
4653
4654 for (i = 0; i < nr_extents; i++) {
4655 if (ext_offset >= new_extents[i].num_bytes) {
4656 ext_offset -= new_extents[i].num_bytes;
4657 continue;
4658 }
4659 extent_len = min(new_extents[i].num_bytes -
4660 ext_offset, num_bytes);
4661
4662 ret = btrfs_insert_empty_item(trans, root,
4663 path, &key,
4664 sizeof(*fi));
4665 BUG_ON(ret);
4666
4667 leaf = path->nodes[0];
4668 fi = btrfs_item_ptr(leaf, path->slots[0],
4669 struct btrfs_file_extent_item);
4670 btrfs_set_file_extent_generation(leaf, fi,
4671 trans->transid);
4672 btrfs_set_file_extent_type(leaf, fi,
4673 BTRFS_FILE_EXTENT_REG);
4674 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4675 new_extents[i].disk_bytenr);
4676 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4677 new_extents[i].disk_num_bytes);
4678 btrfs_set_file_extent_ram_bytes(leaf, fi,
4679 new_extents[i].ram_bytes);
4680
4681 btrfs_set_file_extent_compression(leaf, fi,
4682 new_extents[i].compression);
4683 btrfs_set_file_extent_encryption(leaf, fi,
4684 new_extents[i].encryption);
4685 btrfs_set_file_extent_other_encoding(leaf, fi,
4686 new_extents[i].other_encoding);
4687
4688 btrfs_set_file_extent_num_bytes(leaf, fi,
4689 extent_len);
4690 ext_offset += new_extents[i].offset;
4691 btrfs_set_file_extent_offset(leaf, fi,
4692 ext_offset);
4693 btrfs_mark_buffer_dirty(leaf);
4694
4695 btrfs_drop_extent_cache(inode, key.offset,
4696 key.offset + extent_len - 1, 0);
4697
4698 ret = btrfs_inc_extent_ref(trans, root,
4699 new_extents[i].disk_bytenr,
4700 new_extents[i].disk_num_bytes,
4701 leaf->start,
4702 root->root_key.objectid,
4703 trans->transid, key.objectid);
4704 BUG_ON(ret);
4705 btrfs_release_path(root, path);
4706
4707 inode_add_bytes(inode, extent_len);
4708
4709 ext_offset = 0;
4710 num_bytes -= extent_len;
4711 key.offset += extent_len;
4712
4713 if (num_bytes == 0)
4714 break;
4715 }
4716 BUG_ON(i >= nr_extents);
4717#endif
4718 }
4719
4720 if (extent_locked) {
4721 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4722 lock_end, GFP_NOFS);
4723 extent_locked = 0;
4724 }
4725skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset)
4728 break;
4729
4730 cond_resched();
4731 }
4732 ret = 0;
4733out:
4734 btrfs_release_path(root, path);
4735 if (inode) {
4736 mutex_unlock(&inode->i_mutex);
4737 if (extent_locked) {
4738 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4739 lock_end, GFP_NOFS);
4740 }
4741 iput(inode);
4742 }
4743 return ret;
4744}
4745
4746int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4747 struct btrfs_root *root,
4748 struct extent_buffer *buf, u64 orig_start)
4749{
4750 int level;
4751 int ret;
4752
4753 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4754 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4755
4756 level = btrfs_header_level(buf);
4757 if (level == 0) {
4758 struct btrfs_leaf_ref *ref;
4759 struct btrfs_leaf_ref *orig_ref;
4760
4761 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4762 if (!orig_ref)
4763 return -ENOENT;
4764
4765 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4766 if (!ref) {
4767 btrfs_free_leaf_ref(root, orig_ref);
4768 return -ENOMEM;
4769 }
4770
4771 ref->nritems = orig_ref->nritems;
4772 memcpy(ref->extents, orig_ref->extents,
4773 sizeof(ref->extents[0]) * ref->nritems);
4774
4775 btrfs_free_leaf_ref(root, orig_ref);
4776
4777 ref->root_gen = trans->transid;
4778 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf);
4781 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref);
4784 }
4785 return 0;
4786}
4787
4788static noinline int invalidate_extent_cache(struct btrfs_root *root,
4789 struct extent_buffer *leaf,
4790 struct btrfs_block_group_cache *group,
4791 struct btrfs_root *target_root)
4792{
4793 struct btrfs_key key;
4794 struct inode *inode = NULL;
4795 struct btrfs_file_extent_item *fi;
4796 u64 num_bytes;
4797 u64 skip_objectid = 0;
4798 u32 nritems;
4799 u32 i;
4800
4801 nritems = btrfs_header_nritems(leaf);
4802 for (i = 0; i < nritems; i++) {
4803 btrfs_item_key_to_cpu(leaf, &key, i);
4804 if (key.objectid == skip_objectid ||
4805 key.type != BTRFS_EXTENT_DATA_KEY)
4806 continue;
4807 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4808 if (btrfs_file_extent_type(leaf, fi) ==
4809 BTRFS_FILE_EXTENT_INLINE)
4810 continue;
4811 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4812 continue;
4813 if (!inode || inode->i_ino != key.objectid) {
4814 iput(inode);
4815 inode = btrfs_ilookup(target_root->fs_info->sb,
4816 key.objectid, target_root, 1);
4817 }
4818 if (!inode) {
4819 skip_objectid = key.objectid;
4820 continue;
4821 }
4822 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4823
4824 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4825 key.offset + num_bytes - 1, GFP_NOFS);
4826 btrfs_drop_extent_cache(inode, key.offset,
4827 key.offset + num_bytes - 1, 1);
4828 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4829 key.offset + num_bytes - 1, GFP_NOFS);
4830 cond_resched();
4831 }
4832 iput(inode);
4833 return 0;
4834}
4835
4836static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4837 struct btrfs_root *root,
4838 struct extent_buffer *leaf,
4839 struct btrfs_block_group_cache *group,
4840 struct inode *reloc_inode)
4841{
4842 struct btrfs_key key;
4843 struct btrfs_key extent_key;
4844 struct btrfs_file_extent_item *fi;
4845 struct btrfs_leaf_ref *ref;
4846 struct disk_extent *new_extent;
4847 u64 bytenr;
4848 u64 num_bytes;
4849 u32 nritems;
4850 u32 i;
4851 int ext_index;
4852 int nr_extent;
4853 int ret;
4854
4855 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4856 BUG_ON(!new_extent);
4857
4858 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4859 BUG_ON(!ref);
4860
4861 ext_index = -1;
4862 nritems = btrfs_header_nritems(leaf);
4863 for (i = 0; i < nritems; i++) {
4864 btrfs_item_key_to_cpu(leaf, &key, i);
4865 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4866 continue;
4867 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4868 if (btrfs_file_extent_type(leaf, fi) ==
4869 BTRFS_FILE_EXTENT_INLINE)
4870 continue;
4871 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4872 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4873 if (bytenr == 0)
4874 continue;
4875
4876 ext_index++;
4877 if (bytenr >= group->key.objectid + group->key.offset ||
4878 bytenr + num_bytes <= group->key.objectid)
4879 continue;
4880
4881 extent_key.objectid = bytenr;
4882 extent_key.offset = num_bytes;
4883 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4884 nr_extent = 1;
4885 ret = get_new_locations(reloc_inode, &extent_key,
4886 group->key.objectid, 1,
4887 &new_extent, &nr_extent);
4888 if (ret > 0)
4889 continue;
4890 BUG_ON(ret < 0);
4891
4892 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4893 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4894 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4895 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4896
4897 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4898 new_extent->disk_bytenr);
4899 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4900 new_extent->disk_num_bytes);
4901 btrfs_mark_buffer_dirty(leaf);
4902
4903 ret = btrfs_inc_extent_ref(trans, root,
4904 new_extent->disk_bytenr,
4905 new_extent->disk_num_bytes,
4906 leaf->start,
4907 root->root_key.objectid,
4908 trans->transid, key.objectid);
4909 BUG_ON(ret);
4910 ret = btrfs_free_extent(trans, root,
4911 bytenr, num_bytes, leaf->start,
4912 btrfs_header_owner(leaf),
4913 btrfs_header_generation(leaf),
4914 key.objectid, 0);
4915 BUG_ON(ret);
4916 cond_resched();
4917 }
4918 kfree(new_extent);
4919 BUG_ON(ext_index + 1 != ref->nritems);
4920 btrfs_free_leaf_ref(root, ref);
4921 return 0;
4922}
4923
4924int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4925 struct btrfs_root *root)
4926{
4927 struct btrfs_root *reloc_root;
4928 int ret;
4929
4930 if (root->reloc_root) {
4931 reloc_root = root->reloc_root;
4932 root->reloc_root = NULL;
4933 list_add(&reloc_root->dead_list,
4934 &root->fs_info->dead_reloc_roots);
4935
4936 btrfs_set_root_bytenr(&reloc_root->root_item,
4937 reloc_root->node->start);
4938 btrfs_set_root_level(&root->root_item,
4939 btrfs_header_level(reloc_root->node));
4940 memset(&reloc_root->root_item.drop_progress, 0,
4941 sizeof(struct btrfs_disk_key));
4942 reloc_root->root_item.drop_level = 0;
4943
4944 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4945 &reloc_root->root_key,
4946 &reloc_root->root_item);
4947 BUG_ON(ret);
4948 }
4949 return 0;
4950}
4951
4952int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4953{
4954 struct btrfs_trans_handle *trans;
4955 struct btrfs_root *reloc_root;
4956 struct btrfs_root *prev_root = NULL;
4957 struct list_head dead_roots;
4958 int ret;
4959 unsigned long nr;
4960
4961 INIT_LIST_HEAD(&dead_roots);
4962 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4963
4964 while (!list_empty(&dead_roots)) {
4965 reloc_root = list_entry(dead_roots.prev,
4966 struct btrfs_root, dead_list);
4967 list_del_init(&reloc_root->dead_list);
4968
4969 BUG_ON(reloc_root->commit_root != NULL);
4970 while (1) {
4971 trans = btrfs_join_transaction(root, 1);
4972 BUG_ON(!trans);
4973
4974 mutex_lock(&root->fs_info->drop_mutex);
4975 ret = btrfs_drop_snapshot(trans, reloc_root);
4976 if (ret != -EAGAIN)
4977 break;
4978 mutex_unlock(&root->fs_info->drop_mutex);
4979
4980 nr = trans->blocks_used;
4981 ret = btrfs_end_transaction(trans, root);
4982 BUG_ON(ret);
4983 btrfs_btree_balance_dirty(root, nr);
4984 }
4985
4986 free_extent_buffer(reloc_root->node);
4987
4988 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4989 &reloc_root->root_key);
4990 BUG_ON(ret);
4991 mutex_unlock(&root->fs_info->drop_mutex);
4992
4993 nr = trans->blocks_used;
4994 ret = btrfs_end_transaction(trans, root);
4995 BUG_ON(ret);
4996 btrfs_btree_balance_dirty(root, nr);
4997
4998 kfree(prev_root);
4999 prev_root = reloc_root;
5000 }
5001 if (prev_root) {
5002 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5003 kfree(prev_root);
5004 }
5005 return 0;
5006}
5007
5008int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5009{
5010 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5011 return 0;
5012}
5013
5014int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5015{
5016 struct btrfs_root *reloc_root;
5017 struct btrfs_trans_handle *trans;
5018 struct btrfs_key location;
5019 int found;
5020 int ret;
5021
5022 mutex_lock(&root->fs_info->tree_reloc_mutex);
5023 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5024 BUG_ON(ret);
5025 found = !list_empty(&root->fs_info->dead_reloc_roots);
5026 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5027
5028 if (found) {
5029 trans = btrfs_start_transaction(root, 1);
5030 BUG_ON(!trans);
5031 ret = btrfs_commit_transaction(trans, root);
5032 BUG_ON(ret);
5033 }
5034
5035 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5036 location.offset = (u64)-1;
5037 location.type = BTRFS_ROOT_ITEM_KEY;
5038
5039 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5040 BUG_ON(!reloc_root);
5041 btrfs_orphan_cleanup(reloc_root);
5042 return 0;
5043}
5044
5045static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5046 struct btrfs_root *root)
5047{
5048 struct btrfs_root *reloc_root;
5049 struct extent_buffer *eb;
5050 struct btrfs_root_item *root_item;
5051 struct btrfs_key root_key;
5052 int ret;
5053
5054 BUG_ON(!root->ref_cows);
5055 if (root->reloc_root)
5056 return 0;
5057
5058 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5059 BUG_ON(!root_item);
5060
5061 ret = btrfs_copy_root(trans, root, root->commit_root,
5062 &eb, BTRFS_TREE_RELOC_OBJECTID);
5063 BUG_ON(ret);
5064
5065 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5066 root_key.offset = root->root_key.objectid;
5067 root_key.type = BTRFS_ROOT_ITEM_KEY;
5068
5069 memcpy(root_item, &root->root_item, sizeof(root_item));
5070 btrfs_set_root_refs(root_item, 0);
5071 btrfs_set_root_bytenr(root_item, eb->start);
5072 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5073 btrfs_set_root_generation(root_item, trans->transid);
5074
5075 btrfs_tree_unlock(eb);
5076 free_extent_buffer(eb);
5077
5078 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5079 &root_key, root_item);
5080 BUG_ON(ret);
5081 kfree(root_item);
5082
5083 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5084 &root_key);
5085 BUG_ON(!reloc_root);
5086 reloc_root->last_trans = trans->transid;
5087 reloc_root->commit_root = NULL;
5088 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5089
5090 root->reloc_root = reloc_root;
5091 return 0;
5092}
5093
5094/*
5095 * Core function of space balance.
5096 *
5097 * The idea is using reloc trees to relocate tree blocks in reference
5098 * counted roots. There is one reloc tree for each subvol, and all
5099 * reloc trees share same root key objectid. Reloc trees are snapshots
5100 * of the latest committed roots of subvols (root->commit_root).
5101 *
5102 * To relocate a tree block referenced by a subvol, there are two steps.
5103 * COW the block through subvol's reloc tree, then update block pointer
5104 * in the subvol to point to the new block. Since all reloc trees share
5105 * same root key objectid, doing special handing for tree blocks owned
5106 * by them is easy. Once a tree block has been COWed in one reloc tree,
5107 * we can use the resulting new block directly when the same block is
5108 * required to COW again through other reloc trees. By this way, relocated
5109 * tree blocks are shared between reloc trees, so they are also shared
5110 * between subvols.
5111 */
5112static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5113 struct btrfs_root *root,
5114 struct btrfs_path *path,
5115 struct btrfs_key *first_key,
5116 struct btrfs_ref_path *ref_path,
5117 struct btrfs_block_group_cache *group,
5118 struct inode *reloc_inode)
5119{
5120 struct btrfs_root *reloc_root;
5121 struct extent_buffer *eb = NULL;
5122 struct btrfs_key *keys;
5123 u64 *nodes;
5124 int level;
5125 int shared_level;
5126 int lowest_level = 0;
5127 int ret;
5128
5129 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5130 lowest_level = ref_path->owner_objectid;
5131
5132 if (!root->ref_cows) {
5133 path->lowest_level = lowest_level;
5134 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5135 BUG_ON(ret < 0);
5136 path->lowest_level = 0;
5137 btrfs_release_path(root, path);
5138 return 0;
5139 }
5140
5141 mutex_lock(&root->fs_info->tree_reloc_mutex);
5142 ret = init_reloc_tree(trans, root);
5143 BUG_ON(ret);
5144 reloc_root = root->reloc_root;
5145
5146 shared_level = ref_path->shared_level;
5147 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5148
5149 keys = ref_path->node_keys;
5150 nodes = ref_path->new_nodes;
5151 memset(&keys[shared_level + 1], 0,
5152 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5153 memset(&nodes[shared_level + 1], 0,
5154 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5155
5156 if (nodes[lowest_level] == 0) {
5157 path->lowest_level = lowest_level;
5158 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5159 0, 1);
5160 BUG_ON(ret);
5161 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5162 eb = path->nodes[level];
5163 if (!eb || eb == reloc_root->node)
5164 break;
5165 nodes[level] = eb->start;
5166 if (level == 0)
5167 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5168 else
5169 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5170 }
5171 if (nodes[0] &&
5172 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5173 eb = path->nodes[0];
5174 ret = replace_extents_in_leaf(trans, reloc_root, eb,
5175 group, reloc_inode);
5176 BUG_ON(ret);
5177 }
5178 btrfs_release_path(reloc_root, path);
5179 } else {
5180 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5181 lowest_level);
5182 BUG_ON(ret);
5183 }
5184
5185 /*
5186 * replace tree blocks in the fs tree with tree blocks in
5187 * the reloc tree.
5188 */
5189 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5190 BUG_ON(ret < 0);
5191
5192 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5193 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5194 0, 0);
5195 BUG_ON(ret);
5196 extent_buffer_get(path->nodes[0]);
5197 eb = path->nodes[0];
5198 btrfs_release_path(reloc_root, path);
5199 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5200 BUG_ON(ret);
5201 free_extent_buffer(eb);
5202 }
5203
5204 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5205 path->lowest_level = 0;
5206 return 0;
5207}
5208
5209static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5210 struct btrfs_root *root,
5211 struct btrfs_path *path,
5212 struct btrfs_key *first_key,
5213 struct btrfs_ref_path *ref_path)
5214{
5215 int ret;
5216
5217 ret = relocate_one_path(trans, root, path, first_key,
5218 ref_path, NULL, NULL);
5219 BUG_ON(ret);
5220
5221 if (root == root->fs_info->extent_root)
5222 btrfs_extent_post_op(trans, root);
5223
5224 return 0;
5225}
5226
5227static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5228 struct btrfs_root *extent_root,
5229 struct btrfs_path *path,
5230 struct btrfs_key *extent_key)
5231{
5232 int ret;
5233
5234 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5235 if (ret)
5236 goto out;
5237 ret = btrfs_del_item(trans, extent_root, path);
5238out:
5239 btrfs_release_path(extent_root, path);
5240 return ret;
5241}
5242
5243static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5244 struct btrfs_ref_path *ref_path)
5245{
5246 struct btrfs_key root_key;
5247
5248 root_key.objectid = ref_path->root_objectid;
5249 root_key.type = BTRFS_ROOT_ITEM_KEY;
5250 if (is_cowonly_root(ref_path->root_objectid))
5251 root_key.offset = 0;
5252 else
5253 root_key.offset = (u64)-1;
5254
5255 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5256}
5257
5258static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5259 struct btrfs_path *path,
5260 struct btrfs_key *extent_key,
5261 struct btrfs_block_group_cache *group,
5262 struct inode *reloc_inode, int pass)
5263{
5264 struct btrfs_trans_handle *trans;
5265 struct btrfs_root *found_root;
5266 struct btrfs_ref_path *ref_path = NULL;
5267 struct disk_extent *new_extents = NULL;
5268 int nr_extents = 0;
5269 int loops;
5270 int ret;
5271 int level;
5272 struct btrfs_key first_key;
5273 u64 prev_block = 0;
5274
5275
5276 trans = btrfs_start_transaction(extent_root, 1);
5277 BUG_ON(!trans);
5278
5279 if (extent_key->objectid == 0) {
5280 ret = del_extent_zero(trans, extent_root, path, extent_key);
5281 goto out;
5282 }
5283
5284 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5285 if (!ref_path) {
5286 ret = -ENOMEM;
5287 goto out;
5288 }
5289
5290 for (loops = 0; ; loops++) {
5291 if (loops == 0) {
5292 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5293 extent_key->objectid);
5294 } else {
5295 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5296 }
5297 if (ret < 0)
5298 goto out;
5299 if (ret > 0)
5300 break;
5301
5302 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5303 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5304 continue;
5305
5306 found_root = read_ref_root(extent_root->fs_info, ref_path);
5307 BUG_ON(!found_root);
5308 /*
5309 * for reference counted tree, only process reference paths
5310 * rooted at the latest committed root.
5311 */
5312 if (found_root->ref_cows &&
5313 ref_path->root_generation != found_root->root_key.offset)
5314 continue;
5315
5316 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5317 if (pass == 0) {
5318 /*
5319 * copy data extents to new locations
5320 */
5321 u64 group_start = group->key.objectid;
5322 ret = relocate_data_extent(reloc_inode,
5323 extent_key,
5324 group_start);
5325 if (ret < 0)
5326 goto out;
5327 break;
5328 }
5329 level = 0;
5330 } else {
5331 level = ref_path->owner_objectid;
5332 }
5333
5334 if (prev_block != ref_path->nodes[level]) {
5335 struct extent_buffer *eb;
5336 u64 block_start = ref_path->nodes[level];
5337 u64 block_size = btrfs_level_size(found_root, level);
5338
5339 eb = read_tree_block(found_root, block_start,
5340 block_size, 0);
5341 btrfs_tree_lock(eb);
5342 BUG_ON(level != btrfs_header_level(eb));
5343
5344 if (level == 0)
5345 btrfs_item_key_to_cpu(eb, &first_key, 0);
5346 else
5347 btrfs_node_key_to_cpu(eb, &first_key, 0);
5348
5349 btrfs_tree_unlock(eb);
5350 free_extent_buffer(eb);
5351 prev_block = block_start;
5352 }
5353
5354 btrfs_record_root_in_trans(found_root);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /*
5357 * try to update data extent references while
5358 * keeping metadata shared between snapshots.
5359 */
5360 if (pass == 1) {
5361 ret = relocate_one_path(trans, found_root,
5362 path, &first_key, ref_path,
5363 group, reloc_inode);
5364 if (ret < 0)
5365 goto out;
5366 continue;
5367 }
5368 /*
5369 * use fallback method to process the remaining
5370 * references.
5371 */
5372 if (!new_extents) {
5373 u64 group_start = group->key.objectid;
5374 new_extents = kmalloc(sizeof(*new_extents),
5375 GFP_NOFS);
5376 nr_extents = 1;
5377 ret = get_new_locations(reloc_inode,
5378 extent_key,
5379 group_start, 1,
5380 &new_extents,
5381 &nr_extents);
5382 if (ret)
5383 goto out;
5384 }
5385 ret = replace_one_extent(trans, found_root,
5386 path, extent_key,
5387 &first_key, ref_path,
5388 new_extents, nr_extents);
5389 } else {
5390 ret = relocate_tree_block(trans, found_root, path,
5391 &first_key, ref_path);
5392 }
5393 if (ret < 0)
5394 goto out;
5395 }
5396 ret = 0;
5397out:
5398 btrfs_end_transaction(trans, extent_root);
5399 kfree(new_extents);
5400 kfree(ref_path);
5401 return ret;
5402}
5403
5404static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5405{
5406 u64 num_devices;
5407 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5408 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5409
5410 num_devices = root->fs_info->fs_devices->rw_devices;
5411 if (num_devices == 1) {
5412 stripped |= BTRFS_BLOCK_GROUP_DUP;
5413 stripped = flags & ~stripped;
5414
5415 /* turn raid0 into single device chunks */
5416 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5417 return stripped;
5418
5419 /* turn mirroring into duplication */
5420 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5421 BTRFS_BLOCK_GROUP_RAID10))
5422 return stripped | BTRFS_BLOCK_GROUP_DUP;
5423 return flags;
5424 } else {
5425 /* they already had raid on here, just return */
5426 if (flags & stripped)
5427 return flags;
5428
5429 stripped |= BTRFS_BLOCK_GROUP_DUP;
5430 stripped = flags & ~stripped;
5431
5432 /* switch duplicated blocks with raid1 */
5433 if (flags & BTRFS_BLOCK_GROUP_DUP)
5434 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5435
5436 /* turn single device chunks into raid0 */
5437 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5438 }
5439 return flags;
5440}
5441
5442static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5443 struct btrfs_block_group_cache *shrink_block_group,
5444 int force)
5445{
5446 struct btrfs_trans_handle *trans;
5447 u64 new_alloc_flags;
5448 u64 calc;
5449
5450 spin_lock(&shrink_block_group->lock);
5451 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5452 spin_unlock(&shrink_block_group->lock);
5453
5454 trans = btrfs_start_transaction(root, 1);
5455 spin_lock(&shrink_block_group->lock);
5456
5457 new_alloc_flags = update_block_group_flags(root,
5458 shrink_block_group->flags);
5459 if (new_alloc_flags != shrink_block_group->flags) {
5460 calc =
5461 btrfs_block_group_used(&shrink_block_group->item);
5462 } else {
5463 calc = shrink_block_group->key.offset;
5464 }
5465 spin_unlock(&shrink_block_group->lock);
5466
5467 do_chunk_alloc(trans, root->fs_info->extent_root,
5468 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5469
5470 btrfs_end_transaction(trans, root);
5471 } else
5472 spin_unlock(&shrink_block_group->lock);
5473 return 0;
5474}
5475
5476static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5477 struct btrfs_root *root,
5478 u64 objectid, u64 size)
5479{
5480 struct btrfs_path *path;
5481 struct btrfs_inode_item *item;
5482 struct extent_buffer *leaf;
5483 int ret;
5484
5485 path = btrfs_alloc_path();
5486 if (!path)
5487 return -ENOMEM;
5488
5489 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5490 if (ret)
5491 goto out;
5492
5493 leaf = path->nodes[0];
5494 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5495 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5496 btrfs_set_inode_generation(leaf, item, 1);
5497 btrfs_set_inode_size(leaf, item, size);
5498 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5499 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5500 btrfs_mark_buffer_dirty(leaf);
5501 btrfs_release_path(root, path);
5502out:
5503 btrfs_free_path(path);
5504 return ret;
5505}
5506
5507static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5508 struct btrfs_block_group_cache *group)
5509{
5510 struct inode *inode = NULL;
5511 struct btrfs_trans_handle *trans;
5512 struct btrfs_root *root;
5513 struct btrfs_key root_key;
5514 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5515 int err = 0;
5516
5517 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5518 root_key.type = BTRFS_ROOT_ITEM_KEY;
5519 root_key.offset = (u64)-1;
5520 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5521 if (IS_ERR(root))
5522 return ERR_CAST(root);
5523
5524 trans = btrfs_start_transaction(root, 1);
5525 BUG_ON(!trans);
5526
5527 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5528 if (err)
5529 goto out;
5530
5531 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5532 BUG_ON(err);
5533
5534 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5535 group->key.offset, 0, group->key.offset,
5536 0, 0, 0);
5537 BUG_ON(err);
5538
5539 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5540 if (inode->i_state & I_NEW) {
5541 BTRFS_I(inode)->root = root;
5542 BTRFS_I(inode)->location.objectid = objectid;
5543 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5544 BTRFS_I(inode)->location.offset = 0;
5545 btrfs_read_locked_inode(inode);
5546 unlock_new_inode(inode);
5547 BUG_ON(is_bad_inode(inode));
5548 } else {
5549 BUG_ON(1);
5550 }
5551 BTRFS_I(inode)->index_cnt = group->key.objectid;
5552
5553 err = btrfs_orphan_add(trans, inode);
5554out:
5555 btrfs_end_transaction(trans, root);
5556 if (err) {
5557 if (inode)
5558 iput(inode);
5559 inode = ERR_PTR(err);
5560 }
5561 return inode;
5562}
5563
5564int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5565{
5566
5567 struct btrfs_ordered_sum *sums;
5568 struct btrfs_sector_sum *sector_sum;
5569 struct btrfs_ordered_extent *ordered;
5570 struct btrfs_root *root = BTRFS_I(inode)->root;
5571 struct list_head list;
5572 size_t offset;
5573 int ret;
5574 u64 disk_bytenr;
5575
5576 INIT_LIST_HEAD(&list);
5577
5578 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5579 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5580
5581 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5582 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5583 disk_bytenr + len - 1, &list);
5584
5585 while (!list_empty(&list)) {
5586 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5587 list_del_init(&sums->list);
5588
5589 sector_sum = sums->sums;
5590 sums->bytenr = ordered->start;
5591
5592 offset = 0;
5593 while (offset < sums->len) {
5594 sector_sum->bytenr += ordered->start - disk_bytenr;
5595 sector_sum++;
5596 offset += root->sectorsize;
5597 }
5598
5599 btrfs_add_ordered_sum(inode, ordered, sums);
5600 }
5601 btrfs_put_ordered_extent(ordered);
5602 return 0;
5603}
5604
5605int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5606{
5607 struct btrfs_trans_handle *trans;
5608 struct btrfs_path *path;
5609 struct btrfs_fs_info *info = root->fs_info;
5610 struct extent_buffer *leaf;
5611 struct inode *reloc_inode;
5612 struct btrfs_block_group_cache *block_group;
5613 struct btrfs_key key;
5614 u64 skipped;
5615 u64 cur_byte;
5616 u64 total_found;
5617 u32 nritems;
5618 int ret;
5619 int progress;
5620 int pass = 0;
5621
5622 root = root->fs_info->extent_root;
5623
5624 block_group = btrfs_lookup_block_group(info, group_start);
5625 BUG_ON(!block_group);
5626
5627 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5628 (unsigned long long)block_group->key.objectid,
5629 (unsigned long long)block_group->flags);
5630
5631 path = btrfs_alloc_path();
5632 BUG_ON(!path);
5633
5634 reloc_inode = create_reloc_inode(info, block_group);
5635 BUG_ON(IS_ERR(reloc_inode));
5636
5637 __alloc_chunk_for_shrink(root, block_group, 1);
5638 set_block_group_readonly(block_group);
5639
5640 btrfs_start_delalloc_inodes(info->tree_root);
5641 btrfs_wait_ordered_extents(info->tree_root, 0);
5642again:
5643 skipped = 0;
5644 total_found = 0;
5645 progress = 0;
5646 key.objectid = block_group->key.objectid;
5647 key.offset = 0;
5648 key.type = 0;
5649 cur_byte = key.objectid;
5650
5651 trans = btrfs_start_transaction(info->tree_root, 1);
5652 btrfs_commit_transaction(trans, info->tree_root);
5653
5654 mutex_lock(&root->fs_info->cleaner_mutex);
5655 btrfs_clean_old_snapshots(info->tree_root);
5656 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5657 mutex_unlock(&root->fs_info->cleaner_mutex);
5658
5659 while (1) {
5660 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5661 if (ret < 0)
5662 goto out;
5663next:
5664 leaf = path->nodes[0];
5665 nritems = btrfs_header_nritems(leaf);
5666 if (path->slots[0] >= nritems) {
5667 ret = btrfs_next_leaf(root, path);
5668 if (ret < 0)
5669 goto out;
5670 if (ret == 1) {
5671 ret = 0;
5672 break;
5673 }
5674 leaf = path->nodes[0];
5675 nritems = btrfs_header_nritems(leaf);
5676 }
5677
5678 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5679
5680 if (key.objectid >= block_group->key.objectid +
5681 block_group->key.offset)
5682 break;
5683
5684 if (progress && need_resched()) {
5685 btrfs_release_path(root, path);
5686 cond_resched();
5687 progress = 0;
5688 continue;
5689 }
5690 progress = 1;
5691
5692 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5693 key.objectid + key.offset <= cur_byte) {
5694 path->slots[0]++;
5695 goto next;
5696 }
5697
5698 total_found++;
5699 cur_byte = key.objectid + key.offset;
5700 btrfs_release_path(root, path);
5701
5702 __alloc_chunk_for_shrink(root, block_group, 0);
5703 ret = relocate_one_extent(root, path, &key, block_group,
5704 reloc_inode, pass);
5705 BUG_ON(ret < 0);
5706 if (ret > 0)
5707 skipped++;
5708
5709 key.objectid = cur_byte;
5710 key.type = 0;
5711 key.offset = 0;
5712 }
5713
5714 btrfs_release_path(root, path);
5715
5716 if (pass == 0) {
5717 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5718 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5719 }
5720
5721 if (total_found > 0) {
5722 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5723 (unsigned long long)total_found, pass);
5724 pass++;
5725 if (total_found == skipped && pass > 2) {
5726 iput(reloc_inode);
5727 reloc_inode = create_reloc_inode(info, block_group);
5728 pass = 0;
5729 }
5730 goto again;
5731 }
5732
5733 /* delete reloc_inode */
5734 iput(reloc_inode);
5735
5736 /* unpin extents in this range */
5737 trans = btrfs_start_transaction(info->tree_root, 1);
5738 btrfs_commit_transaction(trans, info->tree_root);
5739
5740 spin_lock(&block_group->lock);
5741 WARN_ON(block_group->pinned > 0);
5742 WARN_ON(block_group->reserved > 0);
5743 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5744 spin_unlock(&block_group->lock);
5745 put_block_group(block_group);
5746 ret = 0;
5747out:
5748 btrfs_free_path(path);
5749 return ret;
5750}
5751
5752static int find_first_block_group(struct btrfs_root *root,
5753 struct btrfs_path *path, struct btrfs_key *key)
5754{
5755 int ret = 0;
5756 struct btrfs_key found_key;
5757 struct extent_buffer *leaf;
5758 int slot;
5759
5760 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5761 if (ret < 0)
5762 goto out;
5763
5764 while (1) {
5765 slot = path->slots[0];
5766 leaf = path->nodes[0];
5767 if (slot >= btrfs_header_nritems(leaf)) {
5768 ret = btrfs_next_leaf(root, path);
5769 if (ret == 0)
5770 continue;
5771 if (ret < 0)
5772 goto out;
5773 break;
5774 }
5775 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5776
5777 if (found_key.objectid >= key->objectid &&
5778 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5779 ret = 0;
5780 goto out;
5781 }
5782 path->slots[0]++;
5783 }
5784 ret = -ENOENT;
5785out:
5786 return ret;
5787}
5788
5789int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{
5791 struct btrfs_block_group_cache *block_group;
5792 struct rb_node *n;
5793
5794 spin_lock(&info->block_group_cache_lock);
5795 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5796 block_group = rb_entry(n, struct btrfs_block_group_cache,
5797 cache_node);
5798 rb_erase(&block_group->cache_node,
5799 &info->block_group_cache_tree);
5800 spin_unlock(&info->block_group_cache_lock);
5801
5802 btrfs_remove_free_space_cache(block_group);
5803 down_write(&block_group->space_info->groups_sem);
5804 list_del(&block_group->list);
5805 up_write(&block_group->space_info->groups_sem);
5806
5807 WARN_ON(atomic_read(&block_group->count) != 1);
5808 kfree(block_group);
5809
5810 spin_lock(&info->block_group_cache_lock);
5811 }
5812 spin_unlock(&info->block_group_cache_lock);
5813 return 0;
5814}
5815
5816int btrfs_read_block_groups(struct btrfs_root *root)
5817{
5818 struct btrfs_path *path;
5819 int ret;
5820 struct btrfs_block_group_cache *cache;
5821 struct btrfs_fs_info *info = root->fs_info;
5822 struct btrfs_space_info *space_info;
5823 struct btrfs_key key;
5824 struct btrfs_key found_key;
5825 struct extent_buffer *leaf;
5826
5827 root = info->extent_root;
5828 key.objectid = 0;
5829 key.offset = 0;
5830 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5831 path = btrfs_alloc_path();
5832 if (!path)
5833 return -ENOMEM;
5834
5835 while (1) {
5836 ret = find_first_block_group(root, path, &key);
5837 if (ret > 0) {
5838 ret = 0;
5839 goto error;
5840 }
5841 if (ret != 0)
5842 goto error;
5843
5844 leaf = path->nodes[0];
5845 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5846 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5847 if (!cache) {
5848 ret = -ENOMEM;
5849 break;
5850 }
5851
5852 atomic_set(&cache->count, 1);
5853 spin_lock_init(&cache->lock);
5854 mutex_init(&cache->alloc_mutex);
5855 mutex_init(&cache->cache_mutex);
5856 INIT_LIST_HEAD(&cache->list);
5857 read_extent_buffer(leaf, &cache->item,
5858 btrfs_item_ptr_offset(leaf, path->slots[0]),
5859 sizeof(cache->item));
5860 memcpy(&cache->key, &found_key, sizeof(found_key));
5861
5862 key.objectid = found_key.objectid + found_key.offset;
5863 btrfs_release_path(root, path);
5864 cache->flags = btrfs_block_group_flags(&cache->item);
5865
5866 ret = update_space_info(info, cache->flags, found_key.offset,
5867 btrfs_block_group_used(&cache->item),
5868 &space_info);
5869 BUG_ON(ret);
5870 cache->space_info = space_info;
5871 down_write(&space_info->groups_sem);
5872 list_add_tail(&cache->list, &space_info->block_groups);
5873 up_write(&space_info->groups_sem);
5874
5875 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5876 BUG_ON(ret);
5877
5878 set_avail_alloc_bits(root->fs_info, cache->flags);
5879 if (btrfs_chunk_readonly(root, cache->key.objectid))
5880 set_block_group_readonly(cache);
5881 }
5882 ret = 0;
5883error:
5884 btrfs_free_path(path);
5885 return ret;
5886}
5887
5888int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5889 struct btrfs_root *root, u64 bytes_used,
5890 u64 type, u64 chunk_objectid, u64 chunk_offset,
5891 u64 size)
5892{
5893 int ret;
5894 struct btrfs_root *extent_root;
5895 struct btrfs_block_group_cache *cache;
5896
5897 extent_root = root->fs_info->extent_root;
5898
5899 root->fs_info->last_trans_new_blockgroup = trans->transid;
5900
5901 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5902 if (!cache)
5903 return -ENOMEM;
5904
5905 cache->key.objectid = chunk_offset;
5906 cache->key.offset = size;
5907 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5908 atomic_set(&cache->count, 1);
5909 spin_lock_init(&cache->lock);
5910 mutex_init(&cache->alloc_mutex);
5911 mutex_init(&cache->cache_mutex);
5912 INIT_LIST_HEAD(&cache->list);
5913
5914 btrfs_set_block_group_used(&cache->item, bytes_used);
5915 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5916 cache->flags = type;
5917 btrfs_set_block_group_flags(&cache->item, type);
5918
5919 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5920 &cache->space_info);
5921 BUG_ON(ret);
5922 down_write(&cache->space_info->groups_sem);
5923 list_add_tail(&cache->list, &cache->space_info->block_groups);
5924 up_write(&cache->space_info->groups_sem);
5925
5926 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5927 BUG_ON(ret);
5928
5929 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5930 sizeof(cache->item));
5931 BUG_ON(ret);
5932
5933 finish_current_insert(trans, extent_root, 0);
5934 ret = del_pending_extents(trans, extent_root, 0);
5935 BUG_ON(ret);
5936 set_avail_alloc_bits(extent_root->fs_info, type);
5937
5938 return 0;
5939}
5940
5941int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5942 struct btrfs_root *root, u64 group_start)
5943{
5944 struct btrfs_path *path;
5945 struct btrfs_block_group_cache *block_group;
5946 struct btrfs_key key;
5947 int ret;
5948
5949 root = root->fs_info->extent_root;
5950
5951 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5952 BUG_ON(!block_group);
5953 BUG_ON(!block_group->ro);
5954
5955 memcpy(&key, &block_group->key, sizeof(key));
5956
5957 path = btrfs_alloc_path();
5958 BUG_ON(!path);
5959
5960 btrfs_remove_free_space_cache(block_group);
5961 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree);
5963 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem);
5966
5967 spin_lock(&block_group->space_info->lock);
5968 block_group->space_info->total_bytes -= block_group->key.offset;
5969 block_group->space_info->bytes_readonly -= block_group->key.offset;
5970 spin_unlock(&block_group->space_info->lock);
5971 block_group->space_info->full = 0;
5972
5973 put_block_group(block_group);
5974 put_block_group(block_group);
5975
5976 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5977 if (ret > 0)
5978 ret = -EIO;
5979 if (ret < 0)
5980 goto out;
5981
5982 ret = btrfs_del_item(trans, root, path);
5983out:
5984 btrfs_free_path(path);
5985 return ret;
5986}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
33#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock);
36#endif
37
38#define BUFFER_LRU_MAX 64
39
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
85 "state %lu in tree %p refs %d\n",
86 (unsigned long long)state->start,
87 (unsigned long long)state->end,
88 state->state, state->tree, atomic_read(&state->refs));
89 list_del(&state->leak_list);
90 kmem_cache_free(extent_state_cache, state);
91
92 }
93
94 while (!list_empty(&buffers)) {
95 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
96 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
97 "refs %d\n", (unsigned long long)eb->start,
98 eb->len, atomic_read(&eb->refs));
99 list_del(&eb->leak_list);
100 kmem_cache_free(extent_buffer_cache, eb);
101 }
102 if (extent_state_cache)
103 kmem_cache_destroy(extent_state_cache);
104 if (extent_buffer_cache)
105 kmem_cache_destroy(extent_buffer_cache);
106}
107
108void extent_io_tree_init(struct extent_io_tree *tree,
109 struct address_space *mapping, gfp_t mask)
110{
111 tree->state.rb_node = NULL;
112 tree->buffer.rb_node = NULL;
113 tree->ops = NULL;
114 tree->dirty_bytes = 0;
115 spin_lock_init(&tree->lock);
116 spin_lock_init(&tree->buffer_lock);
117 tree->mapping = mapping;
118}
119
120static struct extent_state *alloc_extent_state(gfp_t mask)
121{
122 struct extent_state *state;
123#ifdef LEAK_DEBUG
124 unsigned long flags;
125#endif
126
127 state = kmem_cache_alloc(extent_state_cache, mask);
128 if (!state)
129 return state;
130 state->state = 0;
131 state->private = 0;
132 state->tree = NULL;
133#ifdef LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags);
137#endif
138 atomic_set(&state->refs, 1);
139 init_waitqueue_head(&state->wq);
140 return state;
141}
142
143static void free_extent_state(struct extent_state *state)
144{
145 if (!state)
146 return;
147 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG
149 unsigned long flags;
150#endif
151 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags);
156#endif
157 kmem_cache_free(extent_state_cache, state);
158 }
159}
160
161static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
162 struct rb_node *node)
163{
164 struct rb_node **p = &root->rb_node;
165 struct rb_node *parent = NULL;
166 struct tree_entry *entry;
167
168 while (*p) {
169 parent = *p;
170 entry = rb_entry(parent, struct tree_entry, rb_node);
171
172 if (offset < entry->start)
173 p = &(*p)->rb_left;
174 else if (offset > entry->end)
175 p = &(*p)->rb_right;
176 else
177 return parent;
178 }
179
180 entry = rb_entry(node, struct tree_entry, rb_node);
181 rb_link_node(node, parent, p);
182 rb_insert_color(node, root);
183 return NULL;
184}
185
186static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
187 struct rb_node **prev_ret,
188 struct rb_node **next_ret)
189{
190 struct rb_root *root = &tree->state;
191 struct rb_node *n = root->rb_node;
192 struct rb_node *prev = NULL;
193 struct rb_node *orig_prev = NULL;
194 struct tree_entry *entry;
195 struct tree_entry *prev_entry = NULL;
196
197 while (n) {
198 entry = rb_entry(n, struct tree_entry, rb_node);
199 prev = n;
200 prev_entry = entry;
201
202 if (offset < entry->start)
203 n = n->rb_left;
204 else if (offset > entry->end)
205 n = n->rb_right;
206 else
207 return n;
208 }
209
210 if (prev_ret) {
211 orig_prev = prev;
212 while (prev && offset > prev_entry->end) {
213 prev = rb_next(prev);
214 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215 }
216 *prev_ret = prev;
217 prev = orig_prev;
218 }
219
220 if (next_ret) {
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 while (prev && offset < prev_entry->start) {
223 prev = rb_prev(prev);
224 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
225 }
226 *next_ret = prev;
227 }
228 return NULL;
229}
230
231static inline struct rb_node *tree_search(struct extent_io_tree *tree,
232 u64 offset)
233{
234 struct rb_node *prev = NULL;
235 struct rb_node *ret;
236
237 ret = __etree_search(tree, offset, &prev, NULL);
238 if (!ret)
239 return prev;
240 return ret;
241}
242
243static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
244 u64 offset, struct rb_node *node)
245{
246 struct rb_root *root = &tree->buffer;
247 struct rb_node **p = &root->rb_node;
248 struct rb_node *parent = NULL;
249 struct extent_buffer *eb;
250
251 while (*p) {
252 parent = *p;
253 eb = rb_entry(parent, struct extent_buffer, rb_node);
254
255 if (offset < eb->start)
256 p = &(*p)->rb_left;
257 else if (offset > eb->start)
258 p = &(*p)->rb_right;
259 else
260 return eb;
261 }
262
263 rb_link_node(node, parent, p);
264 rb_insert_color(node, root);
265 return NULL;
266}
267
268static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
269 u64 offset)
270{
271 struct rb_root *root = &tree->buffer;
272 struct rb_node *n = root->rb_node;
273 struct extent_buffer *eb;
274
275 while (n) {
276 eb = rb_entry(n, struct extent_buffer, rb_node);
277 if (offset < eb->start)
278 n = n->rb_left;
279 else if (offset > eb->start)
280 n = n->rb_right;
281 else
282 return eb;
283 }
284 return NULL;
285}
286
287/*
288 * utility function to look for merge candidates inside a given range.
289 * Any extents with matching state are merged together into a single
290 * extent in the tree. Extents with EXTENT_IO in their state field
291 * are not merged because the end_io handlers need to be able to do
292 * operations on them without sleeping (or doing allocations/splits).
293 *
294 * This should be called with the tree lock held.
295 */
296static int merge_state(struct extent_io_tree *tree,
297 struct extent_state *state)
298{
299 struct extent_state *other;
300 struct rb_node *other_node;
301
302 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
303 return 0;
304
305 other_node = rb_prev(&state->rb_node);
306 if (other_node) {
307 other = rb_entry(other_node, struct extent_state, rb_node);
308 if (other->end == state->start - 1 &&
309 other->state == state->state) {
310 state->start = other->start;
311 other->tree = NULL;
312 rb_erase(&other->rb_node, &tree->state);
313 free_extent_state(other);
314 }
315 }
316 other_node = rb_next(&state->rb_node);
317 if (other_node) {
318 other = rb_entry(other_node, struct extent_state, rb_node);
319 if (other->start == state->end + 1 &&
320 other->state == state->state) {
321 other->start = state->start;
322 state->tree = NULL;
323 rb_erase(&state->rb_node, &tree->state);
324 free_extent_state(state);
325 }
326 }
327 return 0;
328}
329
330static void set_state_cb(struct extent_io_tree *tree,
331 struct extent_state *state,
332 unsigned long bits)
333{
334 if (tree->ops && tree->ops->set_bit_hook) {
335 tree->ops->set_bit_hook(tree->mapping->host, state->start,
336 state->end, state->state, bits);
337 }
338}
339
340static void clear_state_cb(struct extent_io_tree *tree,
341 struct extent_state *state,
342 unsigned long bits)
343{
344 if (tree->ops && tree->ops->clear_bit_hook) {
345 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
346 state->end, state->state, bits);
347 }
348}
349
350/*
351 * insert an extent_state struct into the tree. 'bits' are set on the
352 * struct before it is inserted.
353 *
354 * This may return -EEXIST if the extent is already there, in which case the
355 * state struct is freed.
356 *
357 * The tree lock is not taken internally. This is a utility function and
358 * probably isn't what you want to call (see set/clear_extent_bit).
359 */
360static int insert_state(struct extent_io_tree *tree,
361 struct extent_state *state, u64 start, u64 end,
362 int bits)
363{
364 struct rb_node *node;
365
366 if (end < start) {
367 printk(KERN_ERR "btrfs end < start %llu %llu\n",
368 (unsigned long long)end,
369 (unsigned long long)start);
370 WARN_ON(1);
371 }
372 if (bits & EXTENT_DIRTY)
373 tree->dirty_bytes += end - start + 1;
374 set_state_cb(tree, state, bits);
375 state->state |= bits;
376 state->start = start;
377 state->end = end;
378 node = tree_insert(&tree->state, end, &state->rb_node);
379 if (node) {
380 struct extent_state *found;
381 found = rb_entry(node, struct extent_state, rb_node);
382 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
383 "%llu %llu\n", (unsigned long long)found->start,
384 (unsigned long long)found->end,
385 (unsigned long long)start, (unsigned long long)end);
386 free_extent_state(state);
387 return -EEXIST;
388 }
389 state->tree = tree;
390 merge_state(tree, state);
391 return 0;
392}
393
394/*
395 * split a given extent state struct in two, inserting the preallocated
396 * struct 'prealloc' as the newly created second half. 'split' indicates an
397 * offset inside 'orig' where it should be split.
398 *
399 * Before calling,
400 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
401 * are two extent state structs in the tree:
402 * prealloc: [orig->start, split - 1]
403 * orig: [ split, orig->end ]
404 *
405 * The tree locks are not taken by this function. They need to be held
406 * by the caller.
407 */
408static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
409 struct extent_state *prealloc, u64 split)
410{
411 struct rb_node *node;
412 prealloc->start = orig->start;
413 prealloc->end = split - 1;
414 prealloc->state = orig->state;
415 orig->start = split;
416
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc);
422 return -EEXIST;
423 }
424 prealloc->tree = tree;
425 return 0;
426}
427
428/*
429 * utility function to clear some bits in an extent state struct.
430 * it will optionally wake up any one waiting on this state (wake == 1), or
431 * forcibly remove the state from the tree (delete == 1).
432 *
433 * If no bits are set on the state struct after clearing things, the
434 * struct is freed and removed from the tree
435 */
436static int clear_state_bit(struct extent_io_tree *tree,
437 struct extent_state *state, int bits, int wake,
438 int delete)
439{
440 int ret = state->state & bits;
441
442 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
443 u64 range = state->end - state->start + 1;
444 WARN_ON(range > tree->dirty_bytes);
445 tree->dirty_bytes -= range;
446 }
447 clear_state_cb(tree, state, bits);
448 state->state &= ~bits;
449 if (wake)
450 wake_up(&state->wq);
451 if (delete || state->state == 0) {
452 if (state->tree) {
453 clear_state_cb(tree, state, state->state);
454 rb_erase(&state->rb_node, &tree->state);
455 state->tree = NULL;
456 free_extent_state(state);
457 } else {
458 WARN_ON(1);
459 }
460 } else {
461 merge_state(tree, state);
462 }
463 return ret;
464}
465
466/*
467 * clear some bits on a range in the tree. This may require splitting
468 * or inserting elements in the tree, so the gfp mask is used to
469 * indicate which allocations or sleeping are allowed.
470 *
471 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
472 * the given range from the tree regardless of state (ie for truncate).
473 *
474 * the range [start, end] is inclusive.
475 *
476 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
477 * bits were already set, or zero if none of the bits were already set.
478 */
479int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
480 int bits, int wake, int delete, gfp_t mask)
481{
482 struct extent_state *state;
483 struct extent_state *prealloc = NULL;
484 struct rb_node *node;
485 int err;
486 int set = 0;
487
488again:
489 if (!prealloc && (mask & __GFP_WAIT)) {
490 prealloc = alloc_extent_state(mask);
491 if (!prealloc)
492 return -ENOMEM;
493 }
494
495 spin_lock(&tree->lock);
496 /*
497 * this search will find the extents that end after
498 * our range starts
499 */
500 node = tree_search(tree, start);
501 if (!node)
502 goto out;
503 state = rb_entry(node, struct extent_state, rb_node);
504 if (state->start > end)
505 goto out;
506 WARN_ON(state->end < start);
507
508 /*
509 * | ---- desired range ---- |
510 * | state | or
511 * | ------------- state -------------- |
512 *
513 * We need to split the extent we found, and may flip
514 * bits on second half.
515 *
516 * If the extent we found extends past our range, we
517 * just split and search again. It'll get split again
518 * the next time though.
519 *
520 * If the extent we found is inside our range, we clear
521 * the desired bit on it.
522 */
523
524 if (state->start < start) {
525 if (!prealloc)
526 prealloc = alloc_extent_state(GFP_ATOMIC);
527 err = split_state(tree, state, prealloc, start);
528 BUG_ON(err == -EEXIST);
529 prealloc = NULL;
530 if (err)
531 goto out;
532 if (state->end <= end) {
533 start = state->end + 1;
534 set |= clear_state_bit(tree, state, bits,
535 wake, delete);
536 } else {
537 start = state->start;
538 }
539 goto search_again;
540 }
541 /*
542 * | ---- desired range ---- |
543 * | state |
544 * We need to split the extent, and clear the bit
545 * on the first half
546 */
547 if (state->start <= end && state->end > end) {
548 if (!prealloc)
549 prealloc = alloc_extent_state(GFP_ATOMIC);
550 err = split_state(tree, state, prealloc, end + 1);
551 BUG_ON(err == -EEXIST);
552
553 if (wake)
554 wake_up(&state->wq);
555 set |= clear_state_bit(tree, prealloc, bits,
556 wake, delete);
557 prealloc = NULL;
558 goto out;
559 }
560
561 start = state->end + 1;
562 set |= clear_state_bit(tree, state, bits, wake, delete);
563 goto search_again;
564
565out:
566 spin_unlock(&tree->lock);
567 if (prealloc)
568 free_extent_state(prealloc);
569
570 return set;
571
572search_again:
573 if (start > end)
574 goto out;
575 spin_unlock(&tree->lock);
576 if (mask & __GFP_WAIT)
577 cond_resched();
578 goto again;
579}
580
581static int wait_on_state(struct extent_io_tree *tree,
582 struct extent_state *state)
583 __releases(tree->lock)
584 __acquires(tree->lock)
585{
586 DEFINE_WAIT(wait);
587 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588 spin_unlock(&tree->lock);
589 schedule();
590 spin_lock(&tree->lock);
591 finish_wait(&state->wq, &wait);
592 return 0;
593}
594
595/*
596 * waits for one or more bits to clear on a range in the state tree.
597 * The range [start, end] is inclusive.
598 * The tree lock is taken by this function
599 */
600int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
601{
602 struct extent_state *state;
603 struct rb_node *node;
604
605 spin_lock(&tree->lock);
606again:
607 while (1) {
608 /*
609 * this search will find all the extents that end after
610 * our range starts
611 */
612 node = tree_search(tree, start);
613 if (!node)
614 break;
615
616 state = rb_entry(node, struct extent_state, rb_node);
617
618 if (state->start > end)
619 goto out;
620
621 if (state->state & bits) {
622 start = state->start;
623 atomic_inc(&state->refs);
624 wait_on_state(tree, state);
625 free_extent_state(state);
626 goto again;
627 }
628 start = state->end + 1;
629
630 if (start > end)
631 break;
632
633 if (need_resched()) {
634 spin_unlock(&tree->lock);
635 cond_resched();
636 spin_lock(&tree->lock);
637 }
638 }
639out:
640 spin_unlock(&tree->lock);
641 return 0;
642}
643
644static void set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state,
646 int bits)
647{
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range;
651 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits;
654}
655
656/*
657 * set some bits on a range in the tree. This may require allocations
658 * or sleeping, so the gfp mask is used to indicate what is allowed.
659 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
661 * range already has the desired bits set. The start of the existing
662 * range is returned in failed_start in this case.
663 *
664 * [start, end] is inclusive
665 * This takes the tree lock.
666 */
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start,
669 gfp_t mask)
670{
671 struct extent_state *state;
672 struct extent_state *prealloc = NULL;
673 struct rb_node *node;
674 int err = 0;
675 int set;
676 u64 last_start;
677 u64 last_end;
678again:
679 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask);
681 if (!prealloc)
682 return -ENOMEM;
683 }
684
685 spin_lock(&tree->lock);
686 /*
687 * this search will find all the extents that end after
688 * our range starts.
689 */
690 node = tree_search(tree, start);
691 if (!node) {
692 err = insert_state(tree, prealloc, start, end, bits);
693 prealloc = NULL;
694 BUG_ON(err == -EEXIST);
695 goto out;
696 }
697
698 state = rb_entry(node, struct extent_state, rb_node);
699 last_start = state->start;
700 last_end = state->end;
701
702 /*
703 * | ---- desired range ---- |
704 * | state |
705 *
706 * Just lock what we found and keep going
707 */
708 if (state->start == start && state->end <= end) {
709 set = state->state & bits;
710 if (set && exclusive) {
711 *failed_start = state->start;
712 err = -EEXIST;
713 goto out;
714 }
715 set_state_bits(tree, state, bits);
716 start = state->end + 1;
717 merge_state(tree, state);
718 goto search_again;
719 }
720
721 /*
722 * | ---- desired range ---- |
723 * | state |
724 * or
725 * | ------------- state -------------- |
726 *
727 * We need to split the extent we found, and may flip bits on
728 * second half.
729 *
730 * If the extent we found extends past our
731 * range, we just split and search again. It'll get split
732 * again the next time though.
733 *
734 * If the extent we found is inside our range, we set the
735 * desired bit on it.
736 */
737 if (state->start < start) {
738 set = state->state & bits;
739 if (exclusive && set) {
740 *failed_start = start;
741 err = -EEXIST;
742 goto out;
743 }
744 err = split_state(tree, state, prealloc, start);
745 BUG_ON(err == -EEXIST);
746 prealloc = NULL;
747 if (err)
748 goto out;
749 if (state->end <= end) {
750 set_state_bits(tree, state, bits);
751 start = state->end + 1;
752 merge_state(tree, state);
753 } else {
754 start = state->start;
755 }
756 goto search_again;
757 }
758 /*
759 * | ---- desired range ---- |
760 * | state | or | state |
761 *
762 * There's a hole, we need to insert something in it and
763 * ignore the extent we found.
764 */
765 if (state->start > start) {
766 u64 this_end;
767 if (end < last_start)
768 this_end = end;
769 else
770 this_end = last_start - 1;
771 err = insert_state(tree, prealloc, start, this_end,
772 bits);
773 prealloc = NULL;
774 BUG_ON(err == -EEXIST);
775 if (err)
776 goto out;
777 start = this_end + 1;
778 goto search_again;
779 }
780 /*
781 * | ---- desired range ---- |
782 * | state |
783 * We need to split the extent, and set the bit
784 * on the first half
785 */
786 if (state->start <= end && state->end > end) {
787 set = state->state & bits;
788 if (exclusive && set) {
789 *failed_start = start;
790 err = -EEXIST;
791 goto out;
792 }
793 err = split_state(tree, state, prealloc, end + 1);
794 BUG_ON(err == -EEXIST);
795
796 set_state_bits(tree, prealloc, bits);
797 merge_state(tree, prealloc);
798 prealloc = NULL;
799 goto out;
800 }
801
802 goto search_again;
803
804out:
805 spin_unlock(&tree->lock);
806 if (prealloc)
807 free_extent_state(prealloc);
808
809 return err;
810
811search_again:
812 if (start > end)
813 goto out;
814 spin_unlock(&tree->lock);
815 if (mask & __GFP_WAIT)
816 cond_resched();
817 goto again;
818}
819
820/* wrappers around set/clear extent bit */
821int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
822 gfp_t mask)
823{
824 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
825 mask);
826}
827
828int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
829 gfp_t mask)
830{
831 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
832}
833
834int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835 int bits, gfp_t mask)
836{
837 return set_extent_bit(tree, start, end, bits, 0, NULL,
838 mask);
839}
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846
847int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
849{
850 return set_extent_bit(tree, start, end,
851 EXTENT_DELALLOC | EXTENT_DIRTY,
852 0, NULL, mask);
853}
854
855int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
856 gfp_t mask)
857{
858 return clear_extent_bit(tree, start, end,
859 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
860}
861
862int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
864{
865 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
866}
867
868int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
869 gfp_t mask)
870{
871 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
872 mask);
873}
874
875static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876 gfp_t mask)
877{
878 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879}
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887
888static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
889 u64 end, gfp_t mask)
890{
891 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
892}
893
894static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
895 gfp_t mask)
896{
897 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898 0, NULL, mask);
899}
900
901static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
902 u64 end, gfp_t mask)
903{
904 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
905}
906
907int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
908{
909 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
910}
911
912/*
913 * either insert or lock state struct between start and end use mask to tell
914 * us if waiting is desired.
915 */
916int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
917{
918 int err;
919 u64 failed_start;
920 while (1) {
921 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
922 &failed_start, mask);
923 if (err == -EEXIST && (mask & __GFP_WAIT)) {
924 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
925 start = failed_start;
926 } else {
927 break;
928 }
929 WARN_ON(start > end);
930 }
931 return err;
932}
933
934int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
935 gfp_t mask)
936{
937 int err;
938 u64 failed_start;
939
940 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
941 &failed_start, mask);
942 if (err == -EEXIST) {
943 if (failed_start > start)
944 clear_extent_bit(tree, start, failed_start - 1,
945 EXTENT_LOCKED, 1, 0, mask);
946 return 0;
947 }
948 return 1;
949}
950
951int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
952 gfp_t mask)
953{
954 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
955}
956
957/*
958 * helper function to set pages and extents in the tree dirty
959 */
960int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
961{
962 unsigned long index = start >> PAGE_CACHE_SHIFT;
963 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
964 struct page *page;
965
966 while (index <= end_index) {
967 page = find_get_page(tree->mapping, index);
968 BUG_ON(!page);
969 __set_page_dirty_nobuffers(page);
970 page_cache_release(page);
971 index++;
972 }
973 set_extent_dirty(tree, start, end, GFP_NOFS);
974 return 0;
975}
976
977/*
978 * helper function to set both pages and extents in the tree writeback
979 */
980static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
981{
982 unsigned long index = start >> PAGE_CACHE_SHIFT;
983 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
984 struct page *page;
985
986 while (index <= end_index) {
987 page = find_get_page(tree->mapping, index);
988 BUG_ON(!page);
989 set_page_writeback(page);
990 page_cache_release(page);
991 index++;
992 }
993 set_extent_writeback(tree, start, end, GFP_NOFS);
994 return 0;
995}
996
997/*
998 * find the first offset in the io tree with 'bits' set. zero is
999 * returned if we find something, and *start_ret and *end_ret are
1000 * set to reflect the state struct that was found.
1001 *
1002 * If nothing was found, 1 is returned, < 0 on error
1003 */
1004int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1005 u64 *start_ret, u64 *end_ret, int bits)
1006{
1007 struct rb_node *node;
1008 struct extent_state *state;
1009 int ret = 1;
1010
1011 spin_lock(&tree->lock);
1012 /*
1013 * this search will find all the extents that end after
1014 * our range starts.
1015 */
1016 node = tree_search(tree, start);
1017 if (!node)
1018 goto out;
1019
1020 while (1) {
1021 state = rb_entry(node, struct extent_state, rb_node);
1022 if (state->end >= start && (state->state & bits)) {
1023 *start_ret = state->start;
1024 *end_ret = state->end;
1025 ret = 0;
1026 break;
1027 }
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1031 }
1032out:
1033 spin_unlock(&tree->lock);
1034 return ret;
1035}
1036
1037/* find the first state struct with 'bits' set after 'start', and
1038 * return it. tree->lock must be held. NULL will returned if
1039 * nothing was found after 'start'
1040 */
1041struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1042 u64 start, int bits)
1043{
1044 struct rb_node *node;
1045 struct extent_state *state;
1046
1047 /*
1048 * this search will find all the extents that end after
1049 * our range starts.
1050 */
1051 node = tree_search(tree, start);
1052 if (!node)
1053 goto out;
1054
1055 while (1) {
1056 state = rb_entry(node, struct extent_state, rb_node);
1057 if (state->end >= start && (state->state & bits))
1058 return state;
1059
1060 node = rb_next(node);
1061 if (!node)
1062 break;
1063 }
1064out:
1065 return NULL;
1066}
1067
1068/*
1069 * find a contiguous range of bytes in the file marked as delalloc, not
1070 * more than 'max_bytes'. start and end are used to return the range,
1071 *
1072 * 1 is returned if we find something, 0 if nothing was in the tree
1073 */
1074static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1075 u64 *start, u64 *end, u64 max_bytes)
1076{
1077 struct rb_node *node;
1078 struct extent_state *state;
1079 u64 cur_start = *start;
1080 u64 found = 0;
1081 u64 total_bytes = 0;
1082
1083 spin_lock(&tree->lock);
1084
1085 /*
1086 * this search will find all the extents that end after
1087 * our range starts.
1088 */
1089 node = tree_search(tree, cur_start);
1090 if (!node) {
1091 if (!found)
1092 *end = (u64)-1;
1093 goto out;
1094 }
1095
1096 while (1) {
1097 state = rb_entry(node, struct extent_state, rb_node);
1098 if (found && (state->start != cur_start ||
1099 (state->state & EXTENT_BOUNDARY))) {
1100 goto out;
1101 }
1102 if (!(state->state & EXTENT_DELALLOC)) {
1103 if (!found)
1104 *end = state->end;
1105 goto out;
1106 }
1107 if (!found)
1108 *start = state->start;
1109 found++;
1110 *end = state->end;
1111 cur_start = state->end + 1;
1112 node = rb_next(node);
1113 if (!node)
1114 break;
1115 total_bytes += state->end - state->start + 1;
1116 if (total_bytes >= max_bytes)
1117 break;
1118 }
1119out:
1120 spin_unlock(&tree->lock);
1121 return found;
1122}
1123
1124static noinline int __unlock_for_delalloc(struct inode *inode,
1125 struct page *locked_page,
1126 u64 start, u64 end)
1127{
1128 int ret;
1129 struct page *pages[16];
1130 unsigned long index = start >> PAGE_CACHE_SHIFT;
1131 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1132 unsigned long nr_pages = end_index - index + 1;
1133 int i;
1134
1135 if (index == locked_page->index && end_index == index)
1136 return 0;
1137
1138 while (nr_pages > 0) {
1139 ret = find_get_pages_contig(inode->i_mapping, index,
1140 min_t(unsigned long, nr_pages,
1141 ARRAY_SIZE(pages)), pages);
1142 for (i = 0; i < ret; i++) {
1143 if (pages[i] != locked_page)
1144 unlock_page(pages[i]);
1145 page_cache_release(pages[i]);
1146 }
1147 nr_pages -= ret;
1148 index += ret;
1149 cond_resched();
1150 }
1151 return 0;
1152}
1153
1154static noinline int lock_delalloc_pages(struct inode *inode,
1155 struct page *locked_page,
1156 u64 delalloc_start,
1157 u64 delalloc_end)
1158{
1159 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1160 unsigned long start_index = index;
1161 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1162 unsigned long pages_locked = 0;
1163 struct page *pages[16];
1164 unsigned long nrpages;
1165 int ret;
1166 int i;
1167
1168 /* the caller is responsible for locking the start index */
1169 if (index == locked_page->index && index == end_index)
1170 return 0;
1171
1172 /* skip the page at the start index */
1173 nrpages = end_index - index + 1;
1174 while (nrpages > 0) {
1175 ret = find_get_pages_contig(inode->i_mapping, index,
1176 min_t(unsigned long,
1177 nrpages, ARRAY_SIZE(pages)), pages);
1178 if (ret == 0) {
1179 ret = -EAGAIN;
1180 goto done;
1181 }
1182 /* now we have an array of pages, lock them all */
1183 for (i = 0; i < ret; i++) {
1184 /*
1185 * the caller is taking responsibility for
1186 * locked_page
1187 */
1188 if (pages[i] != locked_page) {
1189 lock_page(pages[i]);
1190 if (!PageDirty(pages[i]) ||
1191 pages[i]->mapping != inode->i_mapping) {
1192 ret = -EAGAIN;
1193 unlock_page(pages[i]);
1194 page_cache_release(pages[i]);
1195 goto done;
1196 }
1197 }
1198 page_cache_release(pages[i]);
1199 pages_locked++;
1200 }
1201 nrpages -= ret;
1202 index += ret;
1203 cond_resched();
1204 }
1205 ret = 0;
1206done:
1207 if (ret && pages_locked) {
1208 __unlock_for_delalloc(inode, locked_page,
1209 delalloc_start,
1210 ((u64)(start_index + pages_locked - 1)) <<
1211 PAGE_CACHE_SHIFT);
1212 }
1213 return ret;
1214}
1215
1216/*
1217 * find a contiguous range of bytes in the file marked as delalloc, not
1218 * more than 'max_bytes'. start and end are used to return the range,
1219 *
1220 * 1 is returned if we find something, 0 if nothing was in the tree
1221 */
1222static noinline u64 find_lock_delalloc_range(struct inode *inode,
1223 struct extent_io_tree *tree,
1224 struct page *locked_page,
1225 u64 *start, u64 *end,
1226 u64 max_bytes)
1227{
1228 u64 delalloc_start;
1229 u64 delalloc_end;
1230 u64 found;
1231 int ret;
1232 int loops = 0;
1233
1234again:
1235 /* step one, find a bunch of delalloc bytes starting at start */
1236 delalloc_start = *start;
1237 delalloc_end = 0;
1238 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1239 max_bytes);
1240 if (!found || delalloc_end <= *start) {
1241 *start = delalloc_start;
1242 *end = delalloc_end;
1243 return found;
1244 }
1245
1246 /*
1247 * start comes from the offset of locked_page. We have to lock
1248 * pages in order, so we can't process delalloc bytes before
1249 * locked_page
1250 */
1251 if (delalloc_start < *start)
1252 delalloc_start = *start;
1253
1254 /*
1255 * make sure to limit the number of pages we try to lock down
1256 * if we're looping.
1257 */
1258 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1259 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1260
1261 /* step two, lock all the pages after the page that has start */
1262 ret = lock_delalloc_pages(inode, locked_page,
1263 delalloc_start, delalloc_end);
1264 if (ret == -EAGAIN) {
1265 /* some of the pages are gone, lets avoid looping by
1266 * shortening the size of the delalloc range we're searching
1267 */
1268 if (!loops) {
1269 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1270 max_bytes = PAGE_CACHE_SIZE - offset;
1271 loops = 1;
1272 goto again;
1273 } else {
1274 found = 0;
1275 goto out_failed;
1276 }
1277 }
1278 BUG_ON(ret);
1279
1280 /* step three, lock the state bits for the whole range */
1281 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282
1283 /* then test to make sure it is all still delalloc */
1284 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1285 EXTENT_DELALLOC, 1);
1286 if (!ret) {
1287 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1288 __unlock_for_delalloc(inode, locked_page,
1289 delalloc_start, delalloc_end);
1290 cond_resched();
1291 goto again;
1292 }
1293 *start = delalloc_start;
1294 *end = delalloc_end;
1295out_failed:
1296 return found;
1297}
1298
1299int extent_clear_unlock_delalloc(struct inode *inode,
1300 struct extent_io_tree *tree,
1301 u64 start, u64 end, struct page *locked_page,
1302 int unlock_pages,
1303 int clear_unlock,
1304 int clear_delalloc, int clear_dirty,
1305 int set_writeback,
1306 int end_writeback)
1307{
1308 int ret;
1309 struct page *pages[16];
1310 unsigned long index = start >> PAGE_CACHE_SHIFT;
1311 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1312 unsigned long nr_pages = end_index - index + 1;
1313 int i;
1314 int clear_bits = 0;
1315
1316 if (clear_unlock)
1317 clear_bits |= EXTENT_LOCKED;
1318 if (clear_dirty)
1319 clear_bits |= EXTENT_DIRTY;
1320
1321 if (clear_delalloc)
1322 clear_bits |= EXTENT_DELALLOC;
1323
1324 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1325 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1326 return 0;
1327
1328 while (nr_pages > 0) {
1329 ret = find_get_pages_contig(inode->i_mapping, index,
1330 min_t(unsigned long,
1331 nr_pages, ARRAY_SIZE(pages)), pages);
1332 for (i = 0; i < ret; i++) {
1333 if (pages[i] == locked_page) {
1334 page_cache_release(pages[i]);
1335 continue;
1336 }
1337 if (clear_dirty)
1338 clear_page_dirty_for_io(pages[i]);
1339 if (set_writeback)
1340 set_page_writeback(pages[i]);
1341 if (end_writeback)
1342 end_page_writeback(pages[i]);
1343 if (unlock_pages)
1344 unlock_page(pages[i]);
1345 page_cache_release(pages[i]);
1346 }
1347 nr_pages -= ret;
1348 index += ret;
1349 cond_resched();
1350 }
1351 return 0;
1352}
1353
1354/*
1355 * count the number of bytes in the tree that have a given bit(s)
1356 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1357 * cached. The total number found is returned.
1358 */
1359u64 count_range_bits(struct extent_io_tree *tree,
1360 u64 *start, u64 search_end, u64 max_bytes,
1361 unsigned long bits)
1362{
1363 struct rb_node *node;
1364 struct extent_state *state;
1365 u64 cur_start = *start;
1366 u64 total_bytes = 0;
1367 int found = 0;
1368
1369 if (search_end <= cur_start) {
1370 WARN_ON(1);
1371 return 0;
1372 }
1373
1374 spin_lock(&tree->lock);
1375 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1376 total_bytes = tree->dirty_bytes;
1377 goto out;
1378 }
1379 /*
1380 * this search will find all the extents that end after
1381 * our range starts.
1382 */
1383 node = tree_search(tree, cur_start);
1384 if (!node)
1385 goto out;
1386
1387 while (1) {
1388 state = rb_entry(node, struct extent_state, rb_node);
1389 if (state->start > search_end)
1390 break;
1391 if (state->end >= cur_start && (state->state & bits)) {
1392 total_bytes += min(search_end, state->end) + 1 -
1393 max(cur_start, state->start);
1394 if (total_bytes >= max_bytes)
1395 break;
1396 if (!found) {
1397 *start = state->start;
1398 found = 1;
1399 }
1400 }
1401 node = rb_next(node);
1402 if (!node)
1403 break;
1404 }
1405out:
1406 spin_unlock(&tree->lock);
1407 return total_bytes;
1408}
1409
1410#if 0
1411/*
1412 * helper function to lock both pages and extents in the tree.
1413 * pages must be locked first.
1414 */
1415static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1416{
1417 unsigned long index = start >> PAGE_CACHE_SHIFT;
1418 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1419 struct page *page;
1420 int err;
1421
1422 while (index <= end_index) {
1423 page = grab_cache_page(tree->mapping, index);
1424 if (!page) {
1425 err = -ENOMEM;
1426 goto failed;
1427 }
1428 if (IS_ERR(page)) {
1429 err = PTR_ERR(page);
1430 goto failed;
1431 }
1432 index++;
1433 }
1434 lock_extent(tree, start, end, GFP_NOFS);
1435 return 0;
1436
1437failed:
1438 /*
1439 * we failed above in getting the page at 'index', so we undo here
1440 * up to but not including the page at 'index'
1441 */
1442 end_index = index;
1443 index = start >> PAGE_CACHE_SHIFT;
1444 while (index < end_index) {
1445 page = find_get_page(tree->mapping, index);
1446 unlock_page(page);
1447 page_cache_release(page);
1448 index++;
1449 }
1450 return err;
1451}
1452
1453/*
1454 * helper function to unlock both pages and extents in the tree.
1455 */
1456static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1457{
1458 unsigned long index = start >> PAGE_CACHE_SHIFT;
1459 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1460 struct page *page;
1461
1462 while (index <= end_index) {
1463 page = find_get_page(tree->mapping, index);
1464 unlock_page(page);
1465 page_cache_release(page);
1466 index++;
1467 }
1468 unlock_extent(tree, start, end, GFP_NOFS);
1469 return 0;
1470}
1471#endif
1472
1473/*
1474 * set the private field for a given byte offset in the tree. If there isn't
1475 * an extent_state there already, this does nothing.
1476 */
1477int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1478{
1479 struct rb_node *node;
1480 struct extent_state *state;
1481 int ret = 0;
1482
1483 spin_lock(&tree->lock);
1484 /*
1485 * this search will find all the extents that end after
1486 * our range starts.
1487 */
1488 node = tree_search(tree, start);
1489 if (!node) {
1490 ret = -ENOENT;
1491 goto out;
1492 }
1493 state = rb_entry(node, struct extent_state, rb_node);
1494 if (state->start != start) {
1495 ret = -ENOENT;
1496 goto out;
1497 }
1498 state->private = private;
1499out:
1500 spin_unlock(&tree->lock);
1501 return ret;
1502}
1503
1504int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1505{
1506 struct rb_node *node;
1507 struct extent_state *state;
1508 int ret = 0;
1509
1510 spin_lock(&tree->lock);
1511 /*
1512 * this search will find all the extents that end after
1513 * our range starts.
1514 */
1515 node = tree_search(tree, start);
1516 if (!node) {
1517 ret = -ENOENT;
1518 goto out;
1519 }
1520 state = rb_entry(node, struct extent_state, rb_node);
1521 if (state->start != start) {
1522 ret = -ENOENT;
1523 goto out;
1524 }
1525 *private = state->private;
1526out:
1527 spin_unlock(&tree->lock);
1528 return ret;
1529}
1530
1531/*
1532 * searches a range in the state tree for a given mask.
1533 * If 'filled' == 1, this returns 1 only if every extent in the tree
1534 * has the bits set. Otherwise, 1 is returned if any bit in the
1535 * range is found set.
1536 */
1537int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1538 int bits, int filled)
1539{
1540 struct extent_state *state = NULL;
1541 struct rb_node *node;
1542 int bitset = 0;
1543
1544 spin_lock(&tree->lock);
1545 node = tree_search(tree, start);
1546 while (node && start <= end) {
1547 state = rb_entry(node, struct extent_state, rb_node);
1548
1549 if (filled && state->start > start) {
1550 bitset = 0;
1551 break;
1552 }
1553
1554 if (state->start > end)
1555 break;
1556
1557 if (state->state & bits) {
1558 bitset = 1;
1559 if (!filled)
1560 break;
1561 } else if (filled) {
1562 bitset = 0;
1563 break;
1564 }
1565 start = state->end + 1;
1566 if (start > end)
1567 break;
1568 node = rb_next(node);
1569 if (!node) {
1570 if (filled)
1571 bitset = 0;
1572 break;
1573 }
1574 }
1575 spin_unlock(&tree->lock);
1576 return bitset;
1577}
1578
1579/*
1580 * helper function to set a given page up to date if all the
1581 * extents in the tree for that page are up to date
1582 */
1583static int check_page_uptodate(struct extent_io_tree *tree,
1584 struct page *page)
1585{
1586 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587 u64 end = start + PAGE_CACHE_SIZE - 1;
1588 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1589 SetPageUptodate(page);
1590 return 0;
1591}
1592
1593/*
1594 * helper function to unlock a page if all the extents in the tree
1595 * for that page are unlocked
1596 */
1597static int check_page_locked(struct extent_io_tree *tree,
1598 struct page *page)
1599{
1600 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1601 u64 end = start + PAGE_CACHE_SIZE - 1;
1602 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1603 unlock_page(page);
1604 return 0;
1605}
1606
1607/*
1608 * helper function to end page writeback if all the extents
1609 * in the tree for that page are done with writeback
1610 */
1611static int check_page_writeback(struct extent_io_tree *tree,
1612 struct page *page)
1613{
1614 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1615 u64 end = start + PAGE_CACHE_SIZE - 1;
1616 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1617 end_page_writeback(page);
1618 return 0;
1619}
1620
1621/* lots and lots of room for performance fixes in the end_bio funcs */
1622
1623/*
1624 * after a writepage IO is done, we need to:
1625 * clear the uptodate bits on error
1626 * clear the writeback bits in the extent tree for this IO
1627 * end_page_writeback if the page has no more pending IO
1628 *
1629 * Scheduling is not allowed, so the extent state tree is expected
1630 * to have one and only one object corresponding to this IO.
1631 */
1632static void end_bio_extent_writepage(struct bio *bio, int err)
1633{
1634 int uptodate = err == 0;
1635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1636 struct extent_io_tree *tree;
1637 u64 start;
1638 u64 end;
1639 int whole_page;
1640 int ret;
1641
1642 do {
1643 struct page *page = bvec->bv_page;
1644 tree = &BTRFS_I(page->mapping->host)->io_tree;
1645
1646 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1647 bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1651 whole_page = 1;
1652 else
1653 whole_page = 0;
1654
1655 if (--bvec >= bio->bi_io_vec)
1656 prefetchw(&bvec->bv_page->flags);
1657 if (tree->ops && tree->ops->writepage_end_io_hook) {
1658 ret = tree->ops->writepage_end_io_hook(page, start,
1659 end, NULL, uptodate);
1660 if (ret)
1661 uptodate = 0;
1662 }
1663
1664 if (!uptodate && tree->ops &&
1665 tree->ops->writepage_io_failed_hook) {
1666 ret = tree->ops->writepage_io_failed_hook(bio, page,
1667 start, end, NULL);
1668 if (ret == 0) {
1669 uptodate = (err == 0);
1670 continue;
1671 }
1672 }
1673
1674 if (!uptodate) {
1675 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1676 ClearPageUptodate(page);
1677 SetPageError(page);
1678 }
1679
1680 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1681
1682 if (whole_page)
1683 end_page_writeback(page);
1684 else
1685 check_page_writeback(tree, page);
1686 } while (bvec >= bio->bi_io_vec);
1687
1688 bio_put(bio);
1689}
1690
1691/*
1692 * after a readpage IO is done, we need to:
1693 * clear the uptodate bits on error
1694 * set the uptodate bits if things worked
1695 * set the page up to date if all extents in the tree are uptodate
1696 * clear the lock bit in the extent tree
1697 * unlock the page if there are no other extents locked for it
1698 *
1699 * Scheduling is not allowed, so the extent state tree is expected
1700 * to have one and only one object corresponding to this IO.
1701 */
1702static void end_bio_extent_readpage(struct bio *bio, int err)
1703{
1704 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1705 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1706 struct extent_io_tree *tree;
1707 u64 start;
1708 u64 end;
1709 int whole_page;
1710 int ret;
1711
1712 if (err)
1713 uptodate = 0;
1714
1715 do {
1716 struct page *page = bvec->bv_page;
1717 tree = &BTRFS_I(page->mapping->host)->io_tree;
1718
1719 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1720 bvec->bv_offset;
1721 end = start + bvec->bv_len - 1;
1722
1723 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1724 whole_page = 1;
1725 else
1726 whole_page = 0;
1727
1728 if (--bvec >= bio->bi_io_vec)
1729 prefetchw(&bvec->bv_page->flags);
1730
1731 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1732 ret = tree->ops->readpage_end_io_hook(page, start, end,
1733 NULL);
1734 if (ret)
1735 uptodate = 0;
1736 }
1737 if (!uptodate && tree->ops &&
1738 tree->ops->readpage_io_failed_hook) {
1739 ret = tree->ops->readpage_io_failed_hook(bio, page,
1740 start, end, NULL);
1741 if (ret == 0) {
1742 uptodate =
1743 test_bit(BIO_UPTODATE, &bio->bi_flags);
1744 if (err)
1745 uptodate = 0;
1746 continue;
1747 }
1748 }
1749
1750 if (uptodate) {
1751 set_extent_uptodate(tree, start, end,
1752 GFP_ATOMIC);
1753 }
1754 unlock_extent(tree, start, end, GFP_ATOMIC);
1755
1756 if (whole_page) {
1757 if (uptodate) {
1758 SetPageUptodate(page);
1759 } else {
1760 ClearPageUptodate(page);
1761 SetPageError(page);
1762 }
1763 unlock_page(page);
1764 } else {
1765 if (uptodate) {
1766 check_page_uptodate(tree, page);
1767 } else {
1768 ClearPageUptodate(page);
1769 SetPageError(page);
1770 }
1771 check_page_locked(tree, page);
1772 }
1773 } while (bvec >= bio->bi_io_vec);
1774
1775 bio_put(bio);
1776}
1777
1778/*
1779 * IO done from prepare_write is pretty simple, we just unlock
1780 * the structs in the extent tree when done, and set the uptodate bits
1781 * as appropriate.
1782 */
1783static void end_bio_extent_preparewrite(struct bio *bio, int err)
1784{
1785 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1786 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1787 struct extent_io_tree *tree;
1788 u64 start;
1789 u64 end;
1790
1791 do {
1792 struct page *page = bvec->bv_page;
1793 tree = &BTRFS_I(page->mapping->host)->io_tree;
1794
1795 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1796 bvec->bv_offset;
1797 end = start + bvec->bv_len - 1;
1798
1799 if (--bvec >= bio->bi_io_vec)
1800 prefetchw(&bvec->bv_page->flags);
1801
1802 if (uptodate) {
1803 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1804 } else {
1805 ClearPageUptodate(page);
1806 SetPageError(page);
1807 }
1808
1809 unlock_extent(tree, start, end, GFP_ATOMIC);
1810
1811 } while (bvec >= bio->bi_io_vec);
1812
1813 bio_put(bio);
1814}
1815
1816static struct bio *
1817extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1818 gfp_t gfp_flags)
1819{
1820 struct bio *bio;
1821
1822 bio = bio_alloc(gfp_flags, nr_vecs);
1823
1824 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1825 while (!bio && (nr_vecs /= 2))
1826 bio = bio_alloc(gfp_flags, nr_vecs);
1827 }
1828
1829 if (bio) {
1830 bio->bi_size = 0;
1831 bio->bi_bdev = bdev;
1832 bio->bi_sector = first_sector;
1833 }
1834 return bio;
1835}
1836
1837static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1838 unsigned long bio_flags)
1839{
1840 int ret = 0;
1841 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1842 struct page *page = bvec->bv_page;
1843 struct extent_io_tree *tree = bio->bi_private;
1844 u64 start;
1845 u64 end;
1846
1847 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1848 end = start + bvec->bv_len - 1;
1849
1850 bio->bi_private = NULL;
1851
1852 bio_get(bio);
1853
1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags);
1857 else
1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP;
1861 bio_put(bio);
1862 return ret;
1863}
1864
1865static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset,
1868 struct block_device *bdev,
1869 struct bio **bio_ret,
1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func,
1872 int mirror_num,
1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags)
1875{
1876 int ret = 0;
1877 struct bio *bio;
1878 int nr;
1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret;
1886 if (old_compressed)
1887 contig = bio->bi_sector == sector;
1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector;
1891
1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags);
1899 bio = NULL;
1900 } else {
1901 return 0;
1902 }
1903 }
1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES;
1906 else
1907 nr = bio_get_nr_vecs(bdev);
1908
1909 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910
1911 bio_add_page(bio, page, page_size, offset);
1912 bio->bi_end_io = end_io_func;
1913 bio->bi_private = tree;
1914
1915 if (bio_ret)
1916 *bio_ret = bio;
1917 else
1918 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1919
1920 return ret;
1921}
1922
1923void set_page_extent_mapped(struct page *page)
1924{
1925 if (!PagePrivate(page)) {
1926 SetPagePrivate(page);
1927 page_cache_get(page);
1928 set_page_private(page, EXTENT_PAGE_PRIVATE);
1929 }
1930}
1931
1932static void set_page_extent_head(struct page *page, unsigned long len)
1933{
1934 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935}
1936
1937/*
1938 * basic readpage implementation. Locked extent state structs are inserted
1939 * into the tree that are removed when the IO is done (by the end_io
1940 * handlers)
1941 */
1942static int __extent_read_full_page(struct extent_io_tree *tree,
1943 struct page *page,
1944 get_extent_t *get_extent,
1945 struct bio **bio, int mirror_num,
1946 unsigned long *bio_flags)
1947{
1948 struct inode *inode = page->mapping->host;
1949 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1950 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1951 u64 end;
1952 u64 cur = start;
1953 u64 extent_offset;
1954 u64 last_byte = i_size_read(inode);
1955 u64 block_start;
1956 u64 cur_end;
1957 sector_t sector;
1958 struct extent_map *em;
1959 struct block_device *bdev;
1960 int ret;
1961 int nr = 0;
1962 size_t page_offset = 0;
1963 size_t iosize;
1964 size_t disk_io_size;
1965 size_t blocksize = inode->i_sb->s_blocksize;
1966 unsigned long this_bio_flag = 0;
1967
1968 set_page_extent_mapped(page);
1969
1970 end = page_end;
1971 lock_extent(tree, start, end, GFP_NOFS);
1972
1973 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1974 char *userpage;
1975 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1976
1977 if (zero_offset) {
1978 iosize = PAGE_CACHE_SIZE - zero_offset;
1979 userpage = kmap_atomic(page, KM_USER0);
1980 memset(userpage + zero_offset, 0, iosize);
1981 flush_dcache_page(page);
1982 kunmap_atomic(userpage, KM_USER0);
1983 }
1984 }
1985 while (cur <= end) {
1986 if (cur >= last_byte) {
1987 char *userpage;
1988 iosize = PAGE_CACHE_SIZE - page_offset;
1989 userpage = kmap_atomic(page, KM_USER0);
1990 memset(userpage + page_offset, 0, iosize);
1991 flush_dcache_page(page);
1992 kunmap_atomic(userpage, KM_USER0);
1993 set_extent_uptodate(tree, cur, cur + iosize - 1,
1994 GFP_NOFS);
1995 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1996 break;
1997 }
1998 em = get_extent(inode, page, page_offset, cur,
1999 end - cur + 1, 0);
2000 if (IS_ERR(em) || !em) {
2001 SetPageError(page);
2002 unlock_extent(tree, cur, end, GFP_NOFS);
2003 break;
2004 }
2005 extent_offset = cur - em->start;
2006 BUG_ON(extent_map_end(em) <= cur);
2007 BUG_ON(end < cur);
2008
2009 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2010 this_bio_flag = EXTENT_BIO_COMPRESSED;
2011
2012 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2013 cur_end = min(extent_map_end(em) - 1, end);
2014 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2015 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2016 disk_io_size = em->block_len;
2017 sector = em->block_start >> 9;
2018 } else {
2019 sector = (em->block_start + extent_offset) >> 9;
2020 disk_io_size = iosize;
2021 }
2022 bdev = em->bdev;
2023 block_start = em->block_start;
2024 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2025 block_start = EXTENT_MAP_HOLE;
2026 free_extent_map(em);
2027 em = NULL;
2028
2029 /* we've found a hole, just zero and go on */
2030 if (block_start == EXTENT_MAP_HOLE) {
2031 char *userpage;
2032 userpage = kmap_atomic(page, KM_USER0);
2033 memset(userpage + page_offset, 0, iosize);
2034 flush_dcache_page(page);
2035 kunmap_atomic(userpage, KM_USER0);
2036
2037 set_extent_uptodate(tree, cur, cur + iosize - 1,
2038 GFP_NOFS);
2039 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2040 cur = cur + iosize;
2041 page_offset += iosize;
2042 continue;
2043 }
2044 /* the get_extent function already copied into the page */
2045 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2046 check_page_uptodate(tree, page);
2047 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2048 cur = cur + iosize;
2049 page_offset += iosize;
2050 continue;
2051 }
2052 /* we have an inline extent but it didn't get marked up
2053 * to date. Error out
2054 */
2055 if (block_start == EXTENT_MAP_INLINE) {
2056 SetPageError(page);
2057 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2058 cur = cur + iosize;
2059 page_offset += iosize;
2060 continue;
2061 }
2062
2063 ret = 0;
2064 if (tree->ops && tree->ops->readpage_io_hook) {
2065 ret = tree->ops->readpage_io_hook(page, cur,
2066 cur + iosize - 1);
2067 }
2068 if (!ret) {
2069 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2070 pnr -= page->index;
2071 ret = submit_extent_page(READ, tree, page,
2072 sector, disk_io_size, page_offset,
2073 bdev, bio, pnr,
2074 end_bio_extent_readpage, mirror_num,
2075 *bio_flags,
2076 this_bio_flag);
2077 nr++;
2078 *bio_flags = this_bio_flag;
2079 }
2080 if (ret)
2081 SetPageError(page);
2082 cur = cur + iosize;
2083 page_offset += iosize;
2084 }
2085 if (!nr) {
2086 if (!PageError(page))
2087 SetPageUptodate(page);
2088 unlock_page(page);
2089 }
2090 return 0;
2091}
2092
2093int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2094 get_extent_t *get_extent)
2095{
2096 struct bio *bio = NULL;
2097 unsigned long bio_flags = 0;
2098 int ret;
2099
2100 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2101 &bio_flags);
2102 if (bio)
2103 submit_one_bio(READ, bio, 0, bio_flags);
2104 return ret;
2105}
2106
2107/*
2108 * the writepage semantics are similar to regular writepage. extent
2109 * records are inserted to lock ranges in the tree, and as dirty areas
2110 * are found, they are marked writeback. Then the lock bits are removed
2111 * and the end_io handler clears the writeback ranges
2112 */
2113static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2114 void *data)
2115{
2116 struct inode *inode = page->mapping->host;
2117 struct extent_page_data *epd = data;
2118 struct extent_io_tree *tree = epd->tree;
2119 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2120 u64 delalloc_start;
2121 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2122 u64 end;
2123 u64 cur = start;
2124 u64 extent_offset;
2125 u64 last_byte = i_size_read(inode);
2126 u64 block_start;
2127 u64 iosize;
2128 u64 unlock_start;
2129 sector_t sector;
2130 struct extent_map *em;
2131 struct block_device *bdev;
2132 int ret;
2133 int nr = 0;
2134 size_t pg_offset = 0;
2135 size_t blocksize;
2136 loff_t i_size = i_size_read(inode);
2137 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2138 u64 nr_delalloc;
2139 u64 delalloc_end;
2140 int page_started;
2141 int compressed;
2142 unsigned long nr_written = 0;
2143
2144 WARN_ON(!PageLocked(page));
2145 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2146 if (page->index > end_index ||
2147 (page->index == end_index && !pg_offset)) {
2148 page->mapping->a_ops->invalidatepage(page, 0);
2149 unlock_page(page);
2150 return 0;
2151 }
2152
2153 if (page->index == end_index) {
2154 char *userpage;
2155
2156 userpage = kmap_atomic(page, KM_USER0);
2157 memset(userpage + pg_offset, 0,
2158 PAGE_CACHE_SIZE - pg_offset);
2159 kunmap_atomic(userpage, KM_USER0);
2160 flush_dcache_page(page);
2161 }
2162 pg_offset = 0;
2163
2164 set_page_extent_mapped(page);
2165
2166 delalloc_start = start;
2167 delalloc_end = 0;
2168 page_started = 0;
2169 if (!epd->extent_locked) {
2170 while (delalloc_end < page_end) {
2171 nr_delalloc = find_lock_delalloc_range(inode, tree,
2172 page,
2173 &delalloc_start,
2174 &delalloc_end,
2175 128 * 1024 * 1024);
2176 if (nr_delalloc == 0) {
2177 delalloc_start = delalloc_end + 1;
2178 continue;
2179 }
2180 tree->ops->fill_delalloc(inode, page, delalloc_start,
2181 delalloc_end, &page_started,
2182 &nr_written);
2183 delalloc_start = delalloc_end + 1;
2184 }
2185
2186 /* did the fill delalloc function already unlock and start
2187 * the IO?
2188 */
2189 if (page_started) {
2190 ret = 0;
2191 goto update_nr_written;
2192 }
2193 }
2194 lock_extent(tree, start, page_end, GFP_NOFS);
2195
2196 unlock_start = start;
2197
2198 if (tree->ops && tree->ops->writepage_start_hook) {
2199 ret = tree->ops->writepage_start_hook(page, start,
2200 page_end);
2201 if (ret == -EAGAIN) {
2202 unlock_extent(tree, start, page_end, GFP_NOFS);
2203 redirty_page_for_writepage(wbc, page);
2204 unlock_page(page);
2205 ret = 0;
2206 goto update_nr_written;
2207 }
2208 }
2209
2210 nr_written++;
2211
2212 end = page_end;
2213 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2214 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2215
2216 if (last_byte <= start) {
2217 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2218 unlock_extent(tree, start, page_end, GFP_NOFS);
2219 if (tree->ops && tree->ops->writepage_end_io_hook)
2220 tree->ops->writepage_end_io_hook(page, start,
2221 page_end, NULL, 1);
2222 unlock_start = page_end + 1;
2223 goto done;
2224 }
2225
2226 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2227 blocksize = inode->i_sb->s_blocksize;
2228
2229 while (cur <= end) {
2230 if (cur >= last_byte) {
2231 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2232 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2233 if (tree->ops && tree->ops->writepage_end_io_hook)
2234 tree->ops->writepage_end_io_hook(page, cur,
2235 page_end, NULL, 1);
2236 unlock_start = page_end + 1;
2237 break;
2238 }
2239 em = epd->get_extent(inode, page, pg_offset, cur,
2240 end - cur + 1, 1);
2241 if (IS_ERR(em) || !em) {
2242 SetPageError(page);
2243 break;
2244 }
2245
2246 extent_offset = cur - em->start;
2247 BUG_ON(extent_map_end(em) <= cur);
2248 BUG_ON(end < cur);
2249 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2250 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2251 sector = (em->block_start + extent_offset) >> 9;
2252 bdev = em->bdev;
2253 block_start = em->block_start;
2254 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2255 free_extent_map(em);
2256 em = NULL;
2257
2258 /*
2259 * compressed and inline extents are written through other
2260 * paths in the FS
2261 */
2262 if (compressed || block_start == EXTENT_MAP_HOLE ||
2263 block_start == EXTENT_MAP_INLINE) {
2264 clear_extent_dirty(tree, cur,
2265 cur + iosize - 1, GFP_NOFS);
2266
2267 unlock_extent(tree, unlock_start, cur + iosize - 1,
2268 GFP_NOFS);
2269
2270 /*
2271 * end_io notification does not happen here for
2272 * compressed extents
2273 */
2274 if (!compressed && tree->ops &&
2275 tree->ops->writepage_end_io_hook)
2276 tree->ops->writepage_end_io_hook(page, cur,
2277 cur + iosize - 1,
2278 NULL, 1);
2279 else if (compressed) {
2280 /* we don't want to end_page_writeback on
2281 * a compressed extent. this happens
2282 * elsewhere
2283 */
2284 nr++;
2285 }
2286
2287 cur += iosize;
2288 pg_offset += iosize;
2289 unlock_start = cur;
2290 continue;
2291 }
2292 /* leave this out until we have a page_mkwrite call */
2293 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2294 EXTENT_DIRTY, 0)) {
2295 cur = cur + iosize;
2296 pg_offset += iosize;
2297 continue;
2298 }
2299
2300 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2301 if (tree->ops && tree->ops->writepage_io_hook) {
2302 ret = tree->ops->writepage_io_hook(page, cur,
2303 cur + iosize - 1);
2304 } else {
2305 ret = 0;
2306 }
2307 if (ret) {
2308 SetPageError(page);
2309 } else {
2310 unsigned long max_nr = end_index + 1;
2311
2312 set_range_writeback(tree, cur, cur + iosize - 1);
2313 if (!PageWriteback(page)) {
2314 printk(KERN_ERR "btrfs warning page %lu not "
2315 "writeback, cur %llu end %llu\n",
2316 page->index, (unsigned long long)cur,
2317 (unsigned long long)end);
2318 }
2319
2320 ret = submit_extent_page(WRITE, tree, page, sector,
2321 iosize, pg_offset, bdev,
2322 &epd->bio, max_nr,
2323 end_bio_extent_writepage,
2324 0, 0, 0);
2325 if (ret)
2326 SetPageError(page);
2327 }
2328 cur = cur + iosize;
2329 pg_offset += iosize;
2330 nr++;
2331 }
2332done:
2333 if (nr == 0) {
2334 /* make sure the mapping tag for page dirty gets cleared */
2335 set_page_writeback(page);
2336 end_page_writeback(page);
2337 }
2338 if (unlock_start <= page_end)
2339 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2340 unlock_page(page);
2341
2342update_nr_written:
2343 wbc->nr_to_write -= nr_written;
2344 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2345 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2346 page->mapping->writeback_index = page->index + nr_written;
2347 return 0;
2348}
2349
2350/**
2351 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2352 * @mapping: address space structure to write
2353 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2354 * @writepage: function called for each page
2355 * @data: data passed to writepage function
2356 *
2357 * If a page is already under I/O, write_cache_pages() skips it, even
2358 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2359 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2360 * and msync() need to guarantee that all the data which was dirty at the time
2361 * the call was made get new I/O started against them. If wbc->sync_mode is
2362 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2363 * existing IO to complete.
2364 */
2365static int extent_write_cache_pages(struct extent_io_tree *tree,
2366 struct address_space *mapping,
2367 struct writeback_control *wbc,
2368 writepage_t writepage, void *data,
2369 void (*flush_fn)(void *))
2370{
2371 struct backing_dev_info *bdi = mapping->backing_dev_info;
2372 int ret = 0;
2373 int done = 0;
2374 struct pagevec pvec;
2375 int nr_pages;
2376 pgoff_t index;
2377 pgoff_t end; /* Inclusive */
2378 int scanned = 0;
2379 int range_whole = 0;
2380
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */
2389 end = -1;
2390 } else {
2391 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2392 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2393 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2394 range_whole = 1;
2395 scanned = 1;
2396 }
2397retry:
2398 while (!done && (index <= end) &&
2399 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2400 PAGECACHE_TAG_DIRTY, min(end - index,
2401 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2402 unsigned i;
2403
2404 scanned = 1;
2405 for (i = 0; i < nr_pages; i++) {
2406 struct page *page = pvec.pages[i];
2407
2408 /*
2409 * At this point we hold neither mapping->tree_lock nor
2410 * lock on the page itself: the page may be truncated or
2411 * invalidated (changing page->mapping to NULL), or even
2412 * swizzled back from swapper_space to tmpfs file
2413 * mapping
2414 */
2415 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2416 tree->ops->write_cache_pages_lock_hook(page);
2417 else
2418 lock_page(page);
2419
2420 if (unlikely(page->mapping != mapping)) {
2421 unlock_page(page);
2422 continue;
2423 }
2424
2425 if (!wbc->range_cyclic && page->index > end) {
2426 done = 1;
2427 unlock_page(page);
2428 continue;
2429 }
2430
2431 if (wbc->sync_mode != WB_SYNC_NONE) {
2432 if (PageWriteback(page))
2433 flush_fn(data);
2434 wait_on_page_writeback(page);
2435 }
2436
2437 if (PageWriteback(page) ||
2438 !clear_page_dirty_for_io(page)) {
2439 unlock_page(page);
2440 continue;
2441 }
2442
2443 ret = (*writepage)(page, wbc, data);
2444
2445 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2446 unlock_page(page);
2447 ret = 0;
2448 }
2449 if (ret || wbc->nr_to_write <= 0)
2450 done = 1;
2451 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2452 wbc->encountered_congestion = 1;
2453 done = 1;
2454 }
2455 }
2456 pagevec_release(&pvec);
2457 cond_resched();
2458 }
2459 if (!scanned && !done) {
2460 /*
2461 * We hit the last page and there is more work to be done: wrap
2462 * back to the start of the file
2463 */
2464 scanned = 1;
2465 index = 0;
2466 goto retry;
2467 }
2468 return ret;
2469}
2470
2471static noinline void flush_write_bio(void *data)
2472{
2473 struct extent_page_data *epd = data;
2474 if (epd->bio) {
2475 submit_one_bio(WRITE, epd->bio, 0, 0);
2476 epd->bio = NULL;
2477 }
2478}
2479
2480int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2481 get_extent_t *get_extent,
2482 struct writeback_control *wbc)
2483{
2484 int ret;
2485 struct address_space *mapping = page->mapping;
2486 struct extent_page_data epd = {
2487 .bio = NULL,
2488 .tree = tree,
2489 .get_extent = get_extent,
2490 .extent_locked = 0,
2491 };
2492 struct writeback_control wbc_writepages = {
2493 .bdi = wbc->bdi,
2494 .sync_mode = WB_SYNC_NONE,
2495 .older_than_this = NULL,
2496 .nr_to_write = 64,
2497 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2498 .range_end = (loff_t)-1,
2499 };
2500
2501
2502 ret = __extent_writepage(page, wbc, &epd);
2503
2504 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2505 __extent_writepage, &epd, flush_write_bio);
2506 if (epd.bio)
2507 submit_one_bio(WRITE, epd.bio, 0, 0);
2508 return ret;
2509}
2510
2511int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2512 u64 start, u64 end, get_extent_t *get_extent,
2513 int mode)
2514{
2515 int ret = 0;
2516 struct address_space *mapping = inode->i_mapping;
2517 struct page *page;
2518 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2519 PAGE_CACHE_SHIFT;
2520
2521 struct extent_page_data epd = {
2522 .bio = NULL,
2523 .tree = tree,
2524 .get_extent = get_extent,
2525 .extent_locked = 1,
2526 };
2527 struct writeback_control wbc_writepages = {
2528 .bdi = inode->i_mapping->backing_dev_info,
2529 .sync_mode = mode,
2530 .older_than_this = NULL,
2531 .nr_to_write = nr_pages * 2,
2532 .range_start = start,
2533 .range_end = end + 1,
2534 };
2535
2536 while (start <= end) {
2537 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2538 if (clear_page_dirty_for_io(page))
2539 ret = __extent_writepage(page, &wbc_writepages, &epd);
2540 else {
2541 if (tree->ops && tree->ops->writepage_end_io_hook)
2542 tree->ops->writepage_end_io_hook(page, start,
2543 start + PAGE_CACHE_SIZE - 1,
2544 NULL, 1);
2545 unlock_page(page);
2546 }
2547 page_cache_release(page);
2548 start += PAGE_CACHE_SIZE;
2549 }
2550
2551 if (epd.bio)
2552 submit_one_bio(WRITE, epd.bio, 0, 0);
2553 return ret;
2554}
2555
2556int extent_writepages(struct extent_io_tree *tree,
2557 struct address_space *mapping,
2558 get_extent_t *get_extent,
2559 struct writeback_control *wbc)
2560{
2561 int ret = 0;
2562 struct extent_page_data epd = {
2563 .bio = NULL,
2564 .tree = tree,
2565 .get_extent = get_extent,
2566 .extent_locked = 0,
2567 };
2568
2569 ret = extent_write_cache_pages(tree, mapping, wbc,
2570 __extent_writepage, &epd,
2571 flush_write_bio);
2572 if (epd.bio)
2573 submit_one_bio(WRITE, epd.bio, 0, 0);
2574 return ret;
2575}
2576
2577int extent_readpages(struct extent_io_tree *tree,
2578 struct address_space *mapping,
2579 struct list_head *pages, unsigned nr_pages,
2580 get_extent_t get_extent)
2581{
2582 struct bio *bio = NULL;
2583 unsigned page_idx;
2584 struct pagevec pvec;
2585 unsigned long bio_flags = 0;
2586
2587 pagevec_init(&pvec, 0);
2588 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2589 struct page *page = list_entry(pages->prev, struct page, lru);
2590
2591 prefetchw(&page->flags);
2592 list_del(&page->lru);
2593 /*
2594 * what we want to do here is call add_to_page_cache_lru,
2595 * but that isn't exported, so we reproduce it here
2596 */
2597 if (!add_to_page_cache(page, mapping,
2598 page->index, GFP_KERNEL)) {
2599
2600 /* open coding of lru_cache_add, also not exported */
2601 page_cache_get(page);
2602 if (!pagevec_add(&pvec, page))
2603 __pagevec_lru_add_file(&pvec);
2604 __extent_read_full_page(tree, page, get_extent,
2605 &bio, 0, &bio_flags);
2606 }
2607 page_cache_release(page);
2608 }
2609 if (pagevec_count(&pvec))
2610 __pagevec_lru_add_file(&pvec);
2611 BUG_ON(!list_empty(pages));
2612 if (bio)
2613 submit_one_bio(READ, bio, 0, bio_flags);
2614 return 0;
2615}
2616
2617/*
2618 * basic invalidatepage code, this waits on any locked or writeback
2619 * ranges corresponding to the page, and then deletes any extent state
2620 * records from the tree
2621 */
2622int extent_invalidatepage(struct extent_io_tree *tree,
2623 struct page *page, unsigned long offset)
2624{
2625 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2626 u64 end = start + PAGE_CACHE_SIZE - 1;
2627 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2628
2629 start += (offset + blocksize - 1) & ~(blocksize - 1);
2630 if (start > end)
2631 return 0;
2632
2633 lock_extent(tree, start, end, GFP_NOFS);
2634 wait_on_extent_writeback(tree, start, end);
2635 clear_extent_bit(tree, start, end,
2636 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2637 1, 1, GFP_NOFS);
2638 return 0;
2639}
2640
2641/*
2642 * simple commit_write call, set_range_dirty is used to mark both
2643 * the pages and the extent records as dirty
2644 */
2645int extent_commit_write(struct extent_io_tree *tree,
2646 struct inode *inode, struct page *page,
2647 unsigned from, unsigned to)
2648{
2649 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2650
2651 set_page_extent_mapped(page);
2652 set_page_dirty(page);
2653
2654 if (pos > inode->i_size) {
2655 i_size_write(inode, pos);
2656 mark_inode_dirty(inode);
2657 }
2658 return 0;
2659}
2660
2661int extent_prepare_write(struct extent_io_tree *tree,
2662 struct inode *inode, struct page *page,
2663 unsigned from, unsigned to, get_extent_t *get_extent)
2664{
2665 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2666 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2667 u64 block_start;
2668 u64 orig_block_start;
2669 u64 block_end;
2670 u64 cur_end;
2671 struct extent_map *em;
2672 unsigned blocksize = 1 << inode->i_blkbits;
2673 size_t page_offset = 0;
2674 size_t block_off_start;
2675 size_t block_off_end;
2676 int err = 0;
2677 int iocount = 0;
2678 int ret = 0;
2679 int isnew;
2680
2681 set_page_extent_mapped(page);
2682
2683 block_start = (page_start + from) & ~((u64)blocksize - 1);
2684 block_end = (page_start + to - 1) | (blocksize - 1);
2685 orig_block_start = block_start;
2686
2687 lock_extent(tree, page_start, page_end, GFP_NOFS);
2688 while (block_start <= block_end) {
2689 em = get_extent(inode, page, page_offset, block_start,
2690 block_end - block_start + 1, 1);
2691 if (IS_ERR(em) || !em)
2692 goto err;
2693
2694 cur_end = min(block_end, extent_map_end(em) - 1);
2695 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2696 block_off_end = block_off_start + blocksize;
2697 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2698
2699 if (!PageUptodate(page) && isnew &&
2700 (block_off_end > to || block_off_start < from)) {
2701 void *kaddr;
2702
2703 kaddr = kmap_atomic(page, KM_USER0);
2704 if (block_off_end > to)
2705 memset(kaddr + to, 0, block_off_end - to);
2706 if (block_off_start < from)
2707 memset(kaddr + block_off_start, 0,
2708 from - block_off_start);
2709 flush_dcache_page(page);
2710 kunmap_atomic(kaddr, KM_USER0);
2711 }
2712 if ((em->block_start != EXTENT_MAP_HOLE &&
2713 em->block_start != EXTENT_MAP_INLINE) &&
2714 !isnew && !PageUptodate(page) &&
2715 (block_off_end > to || block_off_start < from) &&
2716 !test_range_bit(tree, block_start, cur_end,
2717 EXTENT_UPTODATE, 1)) {
2718 u64 sector;
2719 u64 extent_offset = block_start - em->start;
2720 size_t iosize;
2721 sector = (em->block_start + extent_offset) >> 9;
2722 iosize = (cur_end - block_start + blocksize) &
2723 ~((u64)blocksize - 1);
2724 /*
2725 * we've already got the extent locked, but we
2726 * need to split the state such that our end_bio
2727 * handler can clear the lock.
2728 */
2729 set_extent_bit(tree, block_start,
2730 block_start + iosize - 1,
2731 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2732 ret = submit_extent_page(READ, tree, page,
2733 sector, iosize, page_offset, em->bdev,
2734 NULL, 1,
2735 end_bio_extent_preparewrite, 0,
2736 0, 0);
2737 iocount++;
2738 block_start = block_start + iosize;
2739 } else {
2740 set_extent_uptodate(tree, block_start, cur_end,
2741 GFP_NOFS);
2742 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2743 block_start = cur_end + 1;
2744 }
2745 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2746 free_extent_map(em);
2747 }
2748 if (iocount) {
2749 wait_extent_bit(tree, orig_block_start,
2750 block_end, EXTENT_LOCKED);
2751 }
2752 check_page_uptodate(tree, page);
2753err:
2754 /* FIXME, zero out newly allocated blocks on error */
2755 return err;
2756}
2757
2758/*
2759 * a helper for releasepage, this tests for areas of the page that
2760 * are locked or under IO and drops the related state bits if it is safe
2761 * to drop the page.
2762 */
2763int try_release_extent_state(struct extent_map_tree *map,
2764 struct extent_io_tree *tree, struct page *page,
2765 gfp_t mask)
2766{
2767 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 u64 end = start + PAGE_CACHE_SIZE - 1;
2769 int ret = 1;
2770
2771 if (test_range_bit(tree, start, end,
2772 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2773 ret = 0;
2774 else {
2775 if ((mask & GFP_NOFS) == GFP_NOFS)
2776 mask = GFP_NOFS;
2777 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2778 1, 1, mask);
2779 }
2780 return ret;
2781}
2782
2783/*
2784 * a helper for releasepage. As long as there are no locked extents
2785 * in the range corresponding to the page, both state records and extent
2786 * map records are removed
2787 */
2788int try_release_extent_mapping(struct extent_map_tree *map,
2789 struct extent_io_tree *tree, struct page *page,
2790 gfp_t mask)
2791{
2792 struct extent_map *em;
2793 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 u64 end = start + PAGE_CACHE_SIZE - 1;
2795
2796 if ((mask & __GFP_WAIT) &&
2797 page->mapping->host->i_size > 16 * 1024 * 1024) {
2798 u64 len;
2799 while (start <= end) {
2800 len = end - start + 1;
2801 spin_lock(&map->lock);
2802 em = lookup_extent_mapping(map, start, len);
2803 if (!em || IS_ERR(em)) {
2804 spin_unlock(&map->lock);
2805 break;
2806 }
2807 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2808 em->start != start) {
2809 spin_unlock(&map->lock);
2810 free_extent_map(em);
2811 break;
2812 }
2813 if (!test_range_bit(tree, em->start,
2814 extent_map_end(em) - 1,
2815 EXTENT_LOCKED | EXTENT_WRITEBACK |
2816 EXTENT_ORDERED,
2817 0)) {
2818 remove_extent_mapping(map, em);
2819 /* once for the rb tree */
2820 free_extent_map(em);
2821 }
2822 start = extent_map_end(em);
2823 spin_unlock(&map->lock);
2824
2825 /* once for us */
2826 free_extent_map(em);
2827 }
2828 }
2829 return try_release_extent_state(map, tree, page, mask);
2830}
2831
2832sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2833 get_extent_t *get_extent)
2834{
2835 struct inode *inode = mapping->host;
2836 u64 start = iblock << inode->i_blkbits;
2837 sector_t sector = 0;
2838 size_t blksize = (1 << inode->i_blkbits);
2839 struct extent_map *em;
2840
2841 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2842 GFP_NOFS);
2843 em = get_extent(inode, NULL, 0, start, blksize, 0);
2844 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2845 GFP_NOFS);
2846 if (!em || IS_ERR(em))
2847 return 0;
2848
2849 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2850 goto out;
2851
2852 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2853out:
2854 free_extent_map(em);
2855 return sector;
2856}
2857
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i)
2860{
2861 struct page *p;
2862 struct address_space *mapping;
2863
2864 if (i == 0)
2865 return eb->first_page;
2866 i += eb->start >> PAGE_CACHE_SHIFT;
2867 mapping = eb->first_page->mapping;
2868 if (!mapping)
2869 return NULL;
2870
2871 /*
2872 * extent_buffer_page is only called after pinning the page
2873 * by increasing the reference count. So we know the page must
2874 * be in the radix tree.
2875 */
2876 rcu_read_lock();
2877 p = radix_tree_lookup(&mapping->page_tree, i);
2878 rcu_read_unlock();
2879
2880 return p;
2881}
2882
2883static inline unsigned long num_extent_pages(u64 start, u64 len)
2884{
2885 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2886 (start >> PAGE_CACHE_SHIFT);
2887}
2888
2889static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2890 u64 start,
2891 unsigned long len,
2892 gfp_t mask)
2893{
2894 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG
2896 unsigned long flags;
2897#endif
2898
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start;
2901 eb->len = len;
2902 mutex_init(&eb->mutex);
2903#ifdef LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags);
2907#endif
2908 atomic_set(&eb->refs, 1);
2909
2910 return eb;
2911}
2912
2913static void __free_extent_buffer(struct extent_buffer *eb)
2914{
2915#ifdef LEAK_DEBUG
2916 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list);
2919 spin_unlock_irqrestore(&leak_lock, flags);
2920#endif
2921 kmem_cache_free(extent_buffer_cache, eb);
2922}
2923
2924struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2925 u64 start, unsigned long len,
2926 struct page *page0,
2927 gfp_t mask)
2928{
2929 unsigned long num_pages = num_extent_pages(start, len);
2930 unsigned long i;
2931 unsigned long index = start >> PAGE_CACHE_SHIFT;
2932 struct extent_buffer *eb;
2933 struct extent_buffer *exists = NULL;
2934 struct page *p;
2935 struct address_space *mapping = tree->mapping;
2936 int uptodate = 1;
2937
2938 spin_lock(&tree->buffer_lock);
2939 eb = buffer_search(tree, start);
2940 if (eb) {
2941 atomic_inc(&eb->refs);
2942 spin_unlock(&tree->buffer_lock);
2943 mark_page_accessed(eb->first_page);
2944 return eb;
2945 }
2946 spin_unlock(&tree->buffer_lock);
2947
2948 eb = __alloc_extent_buffer(tree, start, len, mask);
2949 if (!eb)
2950 return NULL;
2951
2952 if (page0) {
2953 eb->first_page = page0;
2954 i = 1;
2955 index++;
2956 page_cache_get(page0);
2957 mark_page_accessed(page0);
2958 set_page_extent_mapped(page0);
2959 set_page_extent_head(page0, len);
2960 uptodate = PageUptodate(page0);
2961 } else {
2962 i = 0;
2963 }
2964 for (; i < num_pages; i++, index++) {
2965 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2966 if (!p) {
2967 WARN_ON(1);
2968 goto free_eb;
2969 }
2970 set_page_extent_mapped(p);
2971 mark_page_accessed(p);
2972 if (i == 0) {
2973 eb->first_page = p;
2974 set_page_extent_head(p, len);
2975 } else {
2976 set_page_private(p, EXTENT_PAGE_PRIVATE);
2977 }
2978 if (!PageUptodate(p))
2979 uptodate = 0;
2980 unlock_page(p);
2981 }
2982 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE;
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985
2986 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2988 if (exists) {
2989 /* add one reference for the caller */
2990 atomic_inc(&exists->refs);
2991 spin_unlock(&tree->buffer_lock);
2992 goto free_eb;
2993 }
2994 spin_unlock(&tree->buffer_lock);
2995
2996 /* add one reference for the tree */
2997 atomic_inc(&eb->refs);
2998 return eb;
2999
3000free_eb:
3001 if (!atomic_dec_and_test(&eb->refs))
3002 return exists;
3003 for (index = 1; index < i; index++)
3004 page_cache_release(extent_buffer_page(eb, index));
3005 page_cache_release(extent_buffer_page(eb, 0));
3006 __free_extent_buffer(eb);
3007 return exists;
3008}
3009
3010struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, unsigned long len,
3012 gfp_t mask)
3013{
3014 struct extent_buffer *eb;
3015
3016 spin_lock(&tree->buffer_lock);
3017 eb = buffer_search(tree, start);
3018 if (eb)
3019 atomic_inc(&eb->refs);
3020 spin_unlock(&tree->buffer_lock);
3021
3022 if (eb)
3023 mark_page_accessed(eb->first_page);
3024
3025 return eb;
3026}
3027
3028void free_extent_buffer(struct extent_buffer *eb)
3029{
3030 if (!eb)
3031 return;
3032
3033 if (!atomic_dec_and_test(&eb->refs))
3034 return;
3035
3036 WARN_ON(1);
3037}
3038
3039int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3040 struct extent_buffer *eb)
3041{
3042 int set;
3043 unsigned long i;
3044 unsigned long num_pages;
3045 struct page *page;
3046
3047 u64 start = eb->start;
3048 u64 end = start + eb->len - 1;
3049
3050 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3051 num_pages = num_extent_pages(eb->start, eb->len);
3052
3053 for (i = 0; i < num_pages; i++) {
3054 page = extent_buffer_page(eb, i);
3055 if (!set && !PageDirty(page))
3056 continue;
3057
3058 lock_page(page);
3059 if (i == 0)
3060 set_page_extent_head(page, eb->len);
3061 else
3062 set_page_private(page, EXTENT_PAGE_PRIVATE);
3063
3064 /*
3065 * if we're on the last page or the first page and the
3066 * block isn't aligned on a page boundary, do extra checks
3067 * to make sure we don't clean page that is partially dirty
3068 */
3069 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3070 ((i == num_pages - 1) &&
3071 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3072 start = (u64)page->index << PAGE_CACHE_SHIFT;
3073 end = start + PAGE_CACHE_SIZE - 1;
3074 if (test_range_bit(tree, start, end,
3075 EXTENT_DIRTY, 0)) {
3076 unlock_page(page);
3077 continue;
3078 }
3079 }
3080 clear_page_dirty_for_io(page);
3081 spin_lock_irq(&page->mapping->tree_lock);
3082 if (!PageDirty(page)) {
3083 radix_tree_tag_clear(&page->mapping->page_tree,
3084 page_index(page),
3085 PAGECACHE_TAG_DIRTY);
3086 }
3087 spin_unlock_irq(&page->mapping->tree_lock);
3088 unlock_page(page);
3089 }
3090 return 0;
3091}
3092
3093int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3094 struct extent_buffer *eb)
3095{
3096 return wait_on_extent_writeback(tree, eb->start,
3097 eb->start + eb->len - 1);
3098}
3099
3100int set_extent_buffer_dirty(struct extent_io_tree *tree,
3101 struct extent_buffer *eb)
3102{
3103 unsigned long i;
3104 unsigned long num_pages;
3105
3106 num_pages = num_extent_pages(eb->start, eb->len);
3107 for (i = 0; i < num_pages; i++) {
3108 struct page *page = extent_buffer_page(eb, i);
3109 /* writepage may need to do something special for the
3110 * first page, we have to make sure page->private is
3111 * properly set. releasepage may drop page->private
3112 * on us if the page isn't already dirty.
3113 */
3114 lock_page(page);
3115 if (i == 0) {
3116 set_page_extent_head(page, eb->len);
3117 } else if (PagePrivate(page) &&
3118 page->private != EXTENT_PAGE_PRIVATE) {
3119 set_page_extent_mapped(page);
3120 }
3121 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3122 set_extent_dirty(tree, page_offset(page),
3123 page_offset(page) + PAGE_CACHE_SIZE - 1,
3124 GFP_NOFS);
3125 unlock_page(page);
3126 }
3127 return 0;
3128}
3129
3130int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3131 struct extent_buffer *eb)
3132{
3133 unsigned long i;
3134 struct page *page;
3135 unsigned long num_pages;
3136
3137 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE;
3139
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS);
3142 for (i = 0; i < num_pages; i++) {
3143 page = extent_buffer_page(eb, i);
3144 if (page)
3145 ClearPageUptodate(page);
3146 }
3147 return 0;
3148}
3149
3150int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 struct page *page;
3155 unsigned long num_pages;
3156
3157 num_pages = num_extent_pages(eb->start, eb->len);
3158
3159 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3160 GFP_NOFS);
3161 for (i = 0; i < num_pages; i++) {
3162 page = extent_buffer_page(eb, i);
3163 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3164 ((i == num_pages - 1) &&
3165 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3166 check_page_uptodate(tree, page);
3167 continue;
3168 }
3169 SetPageUptodate(page);
3170 }
3171 return 0;
3172}
3173
3174int extent_range_uptodate(struct extent_io_tree *tree,
3175 u64 start, u64 end)
3176{
3177 struct page *page;
3178 int ret;
3179 int pg_uptodate = 1;
3180 int uptodate;
3181 unsigned long index;
3182
3183 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3184 if (ret)
3185 return 1;
3186 while (start <= end) {
3187 index = start >> PAGE_CACHE_SHIFT;
3188 page = find_get_page(tree->mapping, index);
3189 uptodate = PageUptodate(page);
3190 page_cache_release(page);
3191 if (!uptodate) {
3192 pg_uptodate = 0;
3193 break;
3194 }
3195 start += PAGE_CACHE_SIZE;
3196 }
3197 return pg_uptodate;
3198}
3199
3200int extent_buffer_uptodate(struct extent_io_tree *tree,
3201 struct extent_buffer *eb)
3202{
3203 int ret = 0;
3204 unsigned long num_pages;
3205 unsigned long i;
3206 struct page *page;
3207 int pg_uptodate = 1;
3208
3209 if (eb->flags & EXTENT_UPTODATE)
3210 return 1;
3211
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3213 EXTENT_UPTODATE, 1);
3214 if (ret)
3215 return ret;
3216
3217 num_pages = num_extent_pages(eb->start, eb->len);
3218 for (i = 0; i < num_pages; i++) {
3219 page = extent_buffer_page(eb, i);
3220 if (!PageUptodate(page)) {
3221 pg_uptodate = 0;
3222 break;
3223 }
3224 }
3225 return pg_uptodate;
3226}
3227
3228int read_extent_buffer_pages(struct extent_io_tree *tree,
3229 struct extent_buffer *eb,
3230 u64 start, int wait,
3231 get_extent_t *get_extent, int mirror_num)
3232{
3233 unsigned long i;
3234 unsigned long start_i;
3235 struct page *page;
3236 int err;
3237 int ret = 0;
3238 int locked_pages = 0;
3239 int all_uptodate = 1;
3240 int inc_all_pages = 0;
3241 unsigned long num_pages;
3242 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0;
3244
3245 if (eb->flags & EXTENT_UPTODATE)
3246 return 0;
3247
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3249 EXTENT_UPTODATE, 1)) {
3250 return 0;
3251 }
3252
3253 if (start) {
3254 WARN_ON(start < eb->start);
3255 start_i = (start >> PAGE_CACHE_SHIFT) -
3256 (eb->start >> PAGE_CACHE_SHIFT);
3257 } else {
3258 start_i = 0;
3259 }
3260
3261 num_pages = num_extent_pages(eb->start, eb->len);
3262 for (i = start_i; i < num_pages; i++) {
3263 page = extent_buffer_page(eb, i);
3264 if (!wait) {
3265 if (!trylock_page(page))
3266 goto unlock_exit;
3267 } else {
3268 lock_page(page);
3269 }
3270 locked_pages++;
3271 if (!PageUptodate(page))
3272 all_uptodate = 0;
3273 }
3274 if (all_uptodate) {
3275 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE;
3277 goto unlock_exit;
3278 }
3279
3280 for (i = start_i; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i);
3282 if (inc_all_pages)
3283 page_cache_get(page);
3284 if (!PageUptodate(page)) {
3285 if (start_i == 0)
3286 inc_all_pages = 1;
3287 ClearPageError(page);
3288 err = __extent_read_full_page(tree, page,
3289 get_extent, &bio,
3290 mirror_num, &bio_flags);
3291 if (err)
3292 ret = err;
3293 } else {
3294 unlock_page(page);
3295 }
3296 }
3297
3298 if (bio)
3299 submit_one_bio(READ, bio, mirror_num, bio_flags);
3300
3301 if (ret || !wait)
3302 return ret;
3303
3304 for (i = start_i; i < num_pages; i++) {
3305 page = extent_buffer_page(eb, i);
3306 wait_on_page_locked(page);
3307 if (!PageUptodate(page))
3308 ret = -EIO;
3309 }
3310
3311 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE;
3313 return ret;
3314
3315unlock_exit:
3316 i = start_i;
3317 while (locked_pages > 0) {
3318 page = extent_buffer_page(eb, i);
3319 i++;
3320 unlock_page(page);
3321 locked_pages--;
3322 }
3323 return ret;
3324}
3325
3326void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3327 unsigned long start,
3328 unsigned long len)
3329{
3330 size_t cur;
3331 size_t offset;
3332 struct page *page;
3333 char *kaddr;
3334 char *dst = (char *)dstv;
3335 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3337
3338 WARN_ON(start > eb->len);
3339 WARN_ON(start + len > eb->start + eb->len);
3340
3341 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3342
3343 while (len > 0) {
3344 page = extent_buffer_page(eb, i);
3345
3346 cur = min(len, (PAGE_CACHE_SIZE - offset));
3347 kaddr = kmap_atomic(page, KM_USER1);
3348 memcpy(dst, kaddr + offset, cur);
3349 kunmap_atomic(kaddr, KM_USER1);
3350
3351 dst += cur;
3352 len -= cur;
3353 offset = 0;
3354 i++;
3355 }
3356}
3357
3358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3359 unsigned long min_len, char **token, char **map,
3360 unsigned long *map_start,
3361 unsigned long *map_len, int km)
3362{
3363 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3364 char *kaddr;
3365 struct page *p;
3366 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3367 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3368 unsigned long end_i = (start_offset + start + min_len - 1) >>
3369 PAGE_CACHE_SHIFT;
3370
3371 if (i != end_i)
3372 return -EINVAL;
3373
3374 if (i == 0) {
3375 offset = start_offset;
3376 *map_start = 0;
3377 } else {
3378 offset = 0;
3379 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3380 }
3381
3382 if (start + min_len > eb->len) {
3383 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3384 "wanted %lu %lu\n", (unsigned long long)eb->start,
3385 eb->len, start, min_len);
3386 WARN_ON(1);
3387 }
3388
3389 p = extent_buffer_page(eb, i);
3390 kaddr = kmap_atomic(p, km);
3391 *token = kaddr;
3392 *map = kaddr + offset;
3393 *map_len = PAGE_CACHE_SIZE - offset;
3394 return 0;
3395}
3396
3397int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3398 unsigned long min_len,
3399 char **token, char **map,
3400 unsigned long *map_start,
3401 unsigned long *map_len, int km)
3402{
3403 int err;
3404 int save = 0;
3405 if (eb->map_token) {
3406 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL;
3408 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km);
3413 if (!err && save) {
3414 eb->map_token = *token;
3415 eb->kaddr = *map;
3416 eb->map_start = *map_start;
3417 eb->map_len = *map_len;
3418 }
3419 return err;
3420}
3421
3422void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3423{
3424 kunmap_atomic(token, km);
3425}
3426
3427int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3428 unsigned long start,
3429 unsigned long len)
3430{
3431 size_t cur;
3432 size_t offset;
3433 struct page *page;
3434 char *kaddr;
3435 char *ptr = (char *)ptrv;
3436 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3437 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3438 int ret = 0;
3439
3440 WARN_ON(start > eb->len);
3441 WARN_ON(start + len > eb->start + eb->len);
3442
3443 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3444
3445 while (len > 0) {
3446 page = extent_buffer_page(eb, i);
3447
3448 cur = min(len, (PAGE_CACHE_SIZE - offset));
3449
3450 kaddr = kmap_atomic(page, KM_USER0);
3451 ret = memcmp(ptr, kaddr + offset, cur);
3452 kunmap_atomic(kaddr, KM_USER0);
3453 if (ret)
3454 break;
3455
3456 ptr += cur;
3457 len -= cur;
3458 offset = 0;
3459 i++;
3460 }
3461 return ret;
3462}
3463
3464void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3465 unsigned long start, unsigned long len)
3466{
3467 size_t cur;
3468 size_t offset;
3469 struct page *page;
3470 char *kaddr;
3471 char *src = (char *)srcv;
3472 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475 WARN_ON(start > eb->len);
3476 WARN_ON(start + len > eb->start + eb->len);
3477
3478 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480 while (len > 0) {
3481 page = extent_buffer_page(eb, i);
3482 WARN_ON(!PageUptodate(page));
3483
3484 cur = min(len, PAGE_CACHE_SIZE - offset);
3485 kaddr = kmap_atomic(page, KM_USER1);
3486 memcpy(kaddr + offset, src, cur);
3487 kunmap_atomic(kaddr, KM_USER1);
3488
3489 src += cur;
3490 len -= cur;
3491 offset = 0;
3492 i++;
3493 }
3494}
3495
3496void memset_extent_buffer(struct extent_buffer *eb, char c,
3497 unsigned long start, unsigned long len)
3498{
3499 size_t cur;
3500 size_t offset;
3501 struct page *page;
3502 char *kaddr;
3503 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505
3506 WARN_ON(start > eb->len);
3507 WARN_ON(start + len > eb->start + eb->len);
3508
3509 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3510
3511 while (len > 0) {
3512 page = extent_buffer_page(eb, i);
3513 WARN_ON(!PageUptodate(page));
3514
3515 cur = min(len, PAGE_CACHE_SIZE - offset);
3516 kaddr = kmap_atomic(page, KM_USER0);
3517 memset(kaddr + offset, c, cur);
3518 kunmap_atomic(kaddr, KM_USER0);
3519
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524}
3525
3526void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3527 unsigned long dst_offset, unsigned long src_offset,
3528 unsigned long len)
3529{
3530 u64 dst_len = dst->len;
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3536 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3537
3538 WARN_ON(src->len != dst_len);
3539
3540 offset = (start_offset + dst_offset) &
3541 ((unsigned long)PAGE_CACHE_SIZE - 1);
3542
3543 while (len > 0) {
3544 page = extent_buffer_page(dst, i);
3545 WARN_ON(!PageUptodate(page));
3546
3547 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3548
3549 kaddr = kmap_atomic(page, KM_USER0);
3550 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3551 kunmap_atomic(kaddr, KM_USER0);
3552
3553 src_offset += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559
3560static void move_pages(struct page *dst_page, struct page *src_page,
3561 unsigned long dst_off, unsigned long src_off,
3562 unsigned long len)
3563{
3564 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3565 if (dst_page == src_page) {
3566 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3567 } else {
3568 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3569 char *p = dst_kaddr + dst_off + len;
3570 char *s = src_kaddr + src_off + len;
3571
3572 while (len--)
3573 *--p = *--s;
3574
3575 kunmap_atomic(src_kaddr, KM_USER1);
3576 }
3577 kunmap_atomic(dst_kaddr, KM_USER0);
3578}
3579
3580static void copy_pages(struct page *dst_page, struct page *src_page,
3581 unsigned long dst_off, unsigned long src_off,
3582 unsigned long len)
3583{
3584 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3585 char *src_kaddr;
3586
3587 if (dst_page != src_page)
3588 src_kaddr = kmap_atomic(src_page, KM_USER1);
3589 else
3590 src_kaddr = dst_kaddr;
3591
3592 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3593 kunmap_atomic(dst_kaddr, KM_USER0);
3594 if (dst_page != src_page)
3595 kunmap_atomic(src_kaddr, KM_USER1);
3596}
3597
3598void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3599 unsigned long src_offset, unsigned long len)
3600{
3601 size_t cur;
3602 size_t dst_off_in_page;
3603 size_t src_off_in_page;
3604 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3605 unsigned long dst_i;
3606 unsigned long src_i;
3607
3608 if (src_offset + len > dst->len) {
3609 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3610 "len %lu dst len %lu\n", src_offset, len, dst->len);
3611 BUG_ON(1);
3612 }
3613 if (dst_offset + len > dst->len) {
3614 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3615 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3616 BUG_ON(1);
3617 }
3618
3619 while (len > 0) {
3620 dst_off_in_page = (start_offset + dst_offset) &
3621 ((unsigned long)PAGE_CACHE_SIZE - 1);
3622 src_off_in_page = (start_offset + src_offset) &
3623 ((unsigned long)PAGE_CACHE_SIZE - 1);
3624
3625 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3626 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3627
3628 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3629 src_off_in_page));
3630 cur = min_t(unsigned long, cur,
3631 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3632
3633 copy_pages(extent_buffer_page(dst, dst_i),
3634 extent_buffer_page(dst, src_i),
3635 dst_off_in_page, src_off_in_page, cur);
3636
3637 src_offset += cur;
3638 dst_offset += cur;
3639 len -= cur;
3640 }
3641}
3642
3643void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3644 unsigned long src_offset, unsigned long len)
3645{
3646 size_t cur;
3647 size_t dst_off_in_page;
3648 size_t src_off_in_page;
3649 unsigned long dst_end = dst_offset + len - 1;
3650 unsigned long src_end = src_offset + len - 1;
3651 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long dst_i;
3653 unsigned long src_i;
3654
3655 if (src_offset + len > dst->len) {
3656 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 "len %lu len %lu\n", src_offset, len, dst->len);
3658 BUG_ON(1);
3659 }
3660 if (dst_offset + len > dst->len) {
3661 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 "len %lu len %lu\n", dst_offset, len, dst->len);
3663 BUG_ON(1);
3664 }
3665 if (dst_offset < src_offset) {
3666 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3667 return;
3668 }
3669 while (len > 0) {
3670 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3671 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3672
3673 dst_off_in_page = (start_offset + dst_end) &
3674 ((unsigned long)PAGE_CACHE_SIZE - 1);
3675 src_off_in_page = (start_offset + src_end) &
3676 ((unsigned long)PAGE_CACHE_SIZE - 1);
3677
3678 cur = min_t(unsigned long, len, src_off_in_page + 1);
3679 cur = min(cur, dst_off_in_page + 1);
3680 move_pages(extent_buffer_page(dst, dst_i),
3681 extent_buffer_page(dst, src_i),
3682 dst_off_in_page - cur + 1,
3683 src_off_in_page - cur + 1, cur);
3684
3685 dst_end -= cur;
3686 src_end -= cur;
3687 len -= cur;
3688 }
3689}
3690
3691int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3692{
3693 u64 start = page_offset(page);
3694 struct extent_buffer *eb;
3695 int ret = 1;
3696 unsigned long i;
3697 unsigned long num_pages;
3698
3699 spin_lock(&tree->buffer_lock);
3700 eb = buffer_search(tree, start);
3701 if (!eb)
3702 goto out;
3703
3704 if (atomic_read(&eb->refs) > 1) {
3705 ret = 0;
3706 goto out;
3707 }
3708 /* at this point we can safely release the extent buffer */
3709 num_pages = num_extent_pages(eb->start, eb->len);
3710 for (i = 0; i < num_pages; i++)
3711 page_cache_release(extent_buffer_page(eb, i));
3712 rb_erase(&eb->rb_node, &tree->buffer);
3713 __free_extent_buffer(eb);
3714out:
3715 spin_unlock(&tree->buffer_lock);
3716 return ret;
3717}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_NODATASUM (1 << 12)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21
22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1
24
25/*
26 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one.
28 */
29#define EXTENT_PAGE_PRIVATE 1
30#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
31
32struct extent_state;
33
34typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 struct bio *bio, int mirror_num,
36 unsigned long bio_flags);
37struct extent_io_ops {
38 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
39 u64 start, u64 end, int *page_started,
40 unsigned long *nr_written);
41 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
42 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
43 extent_submit_bio_hook_t *submit_bio_hook;
44 int (*merge_bio_hook)(struct page *page, unsigned long offset,
45 size_t size, struct bio *bio,
46 unsigned long bio_flags);
47 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
48 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
49 u64 start, u64 end,
50 struct extent_state *state);
51 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
52 u64 start, u64 end,
53 struct extent_state *state);
54 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
55 struct extent_state *state);
56 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
57 struct extent_state *state, int uptodate);
58 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
59 unsigned long old, unsigned long bits);
60 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
61 unsigned long old, unsigned long bits);
62 int (*write_cache_pages_lock_hook)(struct page *page);
63};
64
65struct extent_io_tree {
66 struct rb_root state;
67 struct rb_root buffer;
68 struct address_space *mapping;
69 u64 dirty_bytes;
70 spinlock_t lock;
71 spinlock_t buffer_lock;
72 struct extent_io_ops *ops;
73};
74
75struct extent_state {
76 u64 start;
77 u64 end; /* inclusive */
78 struct rb_node rb_node;
79 struct extent_io_tree *tree;
80 wait_queue_head_t wq;
81 atomic_t refs;
82 unsigned long state;
83
84 /* for use by the FS */
85 u64 private;
86
87 struct list_head leak_list;
88};
89
90struct extent_buffer {
91 u64 start;
92 unsigned long len;
93 char *map_token;
94 char *kaddr;
95 unsigned long map_start;
96 unsigned long map_len;
97 struct page *first_page;
98 atomic_t refs;
99 int flags;
100 struct list_head leak_list;
101 struct rb_node rb_node;
102 struct mutex mutex;
103};
104
105struct extent_map_tree;
106
107static inline struct extent_state *extent_state_next(struct extent_state *state)
108{
109 struct rb_node *node;
110 node = rb_next(&state->rb_node);
111 if (!node)
112 return NULL;
113 return rb_entry(node, struct extent_state, rb_node);
114}
115
116typedef struct extent_map *(get_extent_t)(struct inode *inode,
117 struct page *page,
118 size_t page_offset,
119 u64 start, u64 len,
120 int create);
121
122void extent_io_tree_init(struct extent_io_tree *tree,
123 struct address_space *mapping, gfp_t mask);
124int try_release_extent_mapping(struct extent_map_tree *map,
125 struct extent_io_tree *tree, struct page *page,
126 gfp_t mask);
127int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
128int try_release_extent_state(struct extent_map_tree *map,
129 struct extent_io_tree *tree, struct page *page,
130 gfp_t mask);
131int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
133int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
134 gfp_t mask);
135int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
136 get_extent_t *get_extent);
137int __init extent_io_init(void);
138void extent_io_exit(void);
139
140u64 count_range_bits(struct extent_io_tree *tree,
141 u64 *start, u64 search_end,
142 u64 max_bytes, unsigned long bits);
143
144int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
145 int bits, int filled);
146int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
147 int bits, gfp_t mask);
148int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
149 int bits, int wake, int delete, gfp_t mask);
150int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
151 int bits, gfp_t mask);
152int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
153 gfp_t mask);
154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
159 gfp_t mask);
160int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
161 gfp_t mask);
162int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
163 u64 end, gfp_t mask);
164int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
165 gfp_t mask);
166int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask);
168int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
169 u64 *start_ret, u64 *end_ret, int bits);
170struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
171 u64 start, int bits);
172int extent_invalidatepage(struct extent_io_tree *tree,
173 struct page *page, unsigned long offset);
174int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
175 get_extent_t *get_extent,
176 struct writeback_control *wbc);
177int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
178 u64 start, u64 end, get_extent_t *get_extent,
179 int mode);
180int extent_writepages(struct extent_io_tree *tree,
181 struct address_space *mapping,
182 get_extent_t *get_extent,
183 struct writeback_control *wbc);
184int extent_readpages(struct extent_io_tree *tree,
185 struct address_space *mapping,
186 struct list_head *pages, unsigned nr_pages,
187 get_extent_t get_extent);
188int extent_prepare_write(struct extent_io_tree *tree,
189 struct inode *inode, struct page *page,
190 unsigned from, unsigned to, get_extent_t *get_extent);
191int extent_commit_write(struct extent_io_tree *tree,
192 struct inode *inode, struct page *page,
193 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
199void set_page_extent_mapped(struct page *page);
200
201struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
202 u64 start, unsigned long len,
203 struct page *page0,
204 gfp_t mask);
205struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
206 u64 start, unsigned long len,
207 gfp_t mask);
208void free_extent_buffer(struct extent_buffer *eb);
209int read_extent_buffer_pages(struct extent_io_tree *tree,
210 struct extent_buffer *eb, u64 start, int wait,
211 get_extent_t *get_extent, int mirror_num);
212
213static inline void extent_buffer_get(struct extent_buffer *eb)
214{
215 atomic_inc(&eb->refs);
216}
217
218int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
219 unsigned long start,
220 unsigned long len);
221void read_extent_buffer(struct extent_buffer *eb, void *dst,
222 unsigned long start,
223 unsigned long len);
224void write_extent_buffer(struct extent_buffer *eb, const void *src,
225 unsigned long start, unsigned long len);
226void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
227 unsigned long dst_offset, unsigned long src_offset,
228 unsigned long len);
229void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
230 unsigned long src_offset, unsigned long len);
231void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
232 unsigned long src_offset, unsigned long len);
233void memset_extent_buffer(struct extent_buffer *eb, char c,
234 unsigned long start, unsigned long len);
235int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
236 struct extent_buffer *eb);
237int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
238int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
239int clear_extent_buffer_dirty(struct extent_io_tree *tree,
240 struct extent_buffer *eb);
241int set_extent_buffer_dirty(struct extent_io_tree *tree,
242 struct extent_buffer *eb);
243int set_extent_buffer_uptodate(struct extent_io_tree *tree,
244 struct extent_buffer *eb);
245int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
246 struct extent_buffer *eb);
247int extent_buffer_uptodate(struct extent_io_tree *tree,
248 struct extent_buffer *eb);
249int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
250 unsigned long min_len, char **token, char **map,
251 unsigned long *map_start,
252 unsigned long *map_len, int km);
253int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
254 unsigned long min_len, char **token, char **map,
255 unsigned long *map_start,
256 unsigned long *map_len, int km);
257void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
258int release_extent_buffer_tail_pages(struct extent_buffer *eb);
259int extent_range_uptodate(struct extent_io_tree *tree,
260 u64 start, u64 end);
261int extent_clear_unlock_delalloc(struct inode *inode,
262 struct extent_io_tree *tree,
263 u64 start, u64 end, struct page *locked_page,
264 int unlock_page,
265 int clear_unlock,
266 int clear_delalloc, int clear_dirty,
267 int set_writeback,
268 int end_writeback);
269#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map beeing releasead
73 *
74 * Drops the reference out on @em by one and free the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while (*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while (n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while (prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while (prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was sucessfull.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
271/* simple helper to do math around the end of an extent, handling wrap */
272static u64 range_end(u64 start, u64 len)
273{
274 if (start + len < start)
275 return (u64)-1;
276 return start + len;
277}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, len] range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map beeing removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
6#define EXTENT_MAP_LAST_BYTE (u64)-4
7#define EXTENT_MAP_HOLE (u64)-3
8#define EXTENT_MAP_INLINE (u64)-2
9#define EXTENT_MAP_DELALLOC (u64)-1
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
17struct extent_map {
18 struct rb_node rb_node;
19
20 /* all of these are in bytes */
21 u64 start;
22 u64 len;
23 u64 orig_start;
24 u64 block_start;
25 u64 block_len;
26 unsigned long flags;
27 struct block_device *bdev;
28 atomic_t refs;
29 int in_tree;
30};
31
32struct extent_map_tree {
33 struct rb_root map;
34 spinlock_t lock;
35};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..964652435fd1
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
27#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
28 sizeof(struct btrfs_item) * 2) / \
29 size) - 1))
30
31#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
32 sizeof(struct btrfs_ordered_sum)) / \
33 sizeof(struct btrfs_sector_sum) * \
34 (r)->sectorsize - (r)->sectorsize)
35
36int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root,
38 u64 objectid, u64 pos,
39 u64 disk_offset, u64 disk_num_bytes,
40 u64 num_bytes, u64 offset, u64 ram_bytes,
41 u8 compression, u8 encryption, u16 other_encoding)
42{
43 int ret = 0;
44 struct btrfs_file_extent_item *item;
45 struct btrfs_key file_key;
46 struct btrfs_path *path;
47 struct extent_buffer *leaf;
48
49 path = btrfs_alloc_path();
50 BUG_ON(!path);
51 file_key.objectid = objectid;
52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item));
57 if (ret < 0)
58 goto out;
59 BUG_ON(ret);
60 leaf = path->nodes[0];
61 item = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_file_extent_item);
63 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
64 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
65 btrfs_set_file_extent_offset(leaf, item, offset);
66 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
67 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
68 btrfs_set_file_extent_generation(leaf, item, trans->transid);
69 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
70 btrfs_set_file_extent_compression(leaf, item, compression);
71 btrfs_set_file_extent_encryption(leaf, item, encryption);
72 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
73
74 btrfs_mark_buffer_dirty(leaf);
75out:
76 btrfs_free_path(path);
77 return ret;
78}
79
80struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root,
82 struct btrfs_path *path,
83 u64 bytenr, int cow)
84{
85 int ret;
86 struct btrfs_key file_key;
87 struct btrfs_key found_key;
88 struct btrfs_csum_item *item;
89 struct extent_buffer *leaf;
90 u64 csum_offset = 0;
91 u16 csum_size =
92 btrfs_super_csum_size(&root->fs_info->super_copy);
93 int csums_in_item;
94
95 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
96 file_key.offset = bytenr;
97 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
98 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
99 if (ret < 0)
100 goto fail;
101 leaf = path->nodes[0];
102 if (ret > 0) {
103 ret = 1;
104 if (path->slots[0] == 0)
105 goto fail;
106 path->slots[0]--;
107 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
108 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
109 goto fail;
110
111 csum_offset = (bytenr - found_key.offset) >>
112 root->fs_info->sb->s_blocksize_bits;
113 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
114 csums_in_item /= csum_size;
115
116 if (csum_offset >= csums_in_item) {
117 ret = -EFBIG;
118 goto fail;
119 }
120 }
121 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
122 item = (struct btrfs_csum_item *)((unsigned char *)item +
123 csum_offset * csum_size);
124 return item;
125fail:
126 if (ret > 0)
127 ret = -ENOENT;
128 return ERR_PTR(ret);
129}
130
131
132int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
133 struct btrfs_root *root,
134 struct btrfs_path *path, u64 objectid,
135 u64 offset, int mod)
136{
137 int ret;
138 struct btrfs_key file_key;
139 int ins_len = mod < 0 ? -1 : 0;
140 int cow = mod != 0;
141
142 file_key.objectid = objectid;
143 file_key.offset = offset;
144 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
145 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
146 return ret;
147}
148
149
150int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
151 struct bio *bio, u32 *dst)
152{
153 u32 sum;
154 struct bio_vec *bvec = bio->bi_io_vec;
155 int bio_index = 0;
156 u64 offset;
157 u64 item_start_offset = 0;
158 u64 item_last_offset = 0;
159 u64 disk_bytenr;
160 u32 diff;
161 u16 csum_size =
162 btrfs_super_csum_size(&root->fs_info->super_copy);
163 int ret;
164 struct btrfs_path *path;
165 struct btrfs_csum_item *item = NULL;
166 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
167
168 path = btrfs_alloc_path();
169 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
170 path->reada = 2;
171
172 WARN_ON(bio->bi_vcnt <= 0);
173
174 disk_bytenr = (u64)bio->bi_sector << 9;
175 while (bio_index < bio->bi_vcnt) {
176 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
177 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
178 if (ret == 0)
179 goto found;
180
181 if (!item || disk_bytenr < item_start_offset ||
182 disk_bytenr >= item_last_offset) {
183 struct btrfs_key found_key;
184 u32 item_size;
185
186 if (item)
187 btrfs_release_path(root, path);
188 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
189 path, disk_bytenr, 0);
190 if (IS_ERR(item)) {
191 ret = PTR_ERR(item);
192 if (ret == -ENOENT || ret == -EFBIG)
193 ret = 0;
194 sum = 0;
195 if (BTRFS_I(inode)->root->root_key.objectid ==
196 BTRFS_DATA_RELOC_TREE_OBJECTID) {
197 set_extent_bits(io_tree, offset,
198 offset + bvec->bv_len - 1,
199 EXTENT_NODATASUM, GFP_NOFS);
200 } else {
201 printk(KERN_INFO "btrfs no csum found "
202 "for inode %lu start %llu\n",
203 inode->i_ino,
204 (unsigned long long)offset);
205 }
206 item = NULL;
207 btrfs_release_path(root, path);
208 goto found;
209 }
210 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
211 path->slots[0]);
212
213 item_start_offset = found_key.offset;
214 item_size = btrfs_item_size_nr(path->nodes[0],
215 path->slots[0]);
216 item_last_offset = item_start_offset +
217 (item_size / csum_size) *
218 root->sectorsize;
219 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
220 struct btrfs_csum_item);
221 }
222 /*
223 * this byte range must be able to fit inside
224 * a single leaf so it will also fit inside a u32
225 */
226 diff = disk_bytenr - item_start_offset;
227 diff = diff / root->sectorsize;
228 diff = diff * csum_size;
229
230 read_extent_buffer(path->nodes[0], &sum,
231 ((unsigned long)item) + diff,
232 csum_size);
233found:
234 if (dst)
235 *dst++ = sum;
236 else
237 set_state_private(io_tree, offset, sum);
238 disk_bytenr += bvec->bv_len;
239 bio_index++;
240 bvec++;
241 }
242 btrfs_free_path(path);
243 return 0;
244}
245
246int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
247 struct list_head *list)
248{
249 struct btrfs_key key;
250 struct btrfs_path *path;
251 struct extent_buffer *leaf;
252 struct btrfs_ordered_sum *sums;
253 struct btrfs_sector_sum *sector_sum;
254 struct btrfs_csum_item *item;
255 unsigned long offset;
256 int ret;
257 size_t size;
258 u64 csum_end;
259 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
260
261 path = btrfs_alloc_path();
262 BUG_ON(!path);
263
264 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
265 key.offset = start;
266 key.type = BTRFS_EXTENT_CSUM_KEY;
267
268 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
269 if (ret < 0)
270 goto fail;
271 if (ret > 0 && path->slots[0] > 0) {
272 leaf = path->nodes[0];
273 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
274 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
275 key.type == BTRFS_EXTENT_CSUM_KEY) {
276 offset = (start - key.offset) >>
277 root->fs_info->sb->s_blocksize_bits;
278 if (offset * csum_size <
279 btrfs_item_size_nr(leaf, path->slots[0] - 1))
280 path->slots[0]--;
281 }
282 }
283
284 while (start <= end) {
285 leaf = path->nodes[0];
286 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
287 ret = btrfs_next_leaf(root, path);
288 if (ret < 0)
289 goto fail;
290 if (ret > 0)
291 break;
292 leaf = path->nodes[0];
293 }
294
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
297 key.type != BTRFS_EXTENT_CSUM_KEY)
298 break;
299
300 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
301 if (key.offset > end)
302 break;
303
304 if (key.offset > start)
305 start = key.offset;
306
307 size = btrfs_item_size_nr(leaf, path->slots[0]);
308 csum_end = key.offset + (size / csum_size) * root->sectorsize;
309 if (csum_end <= start) {
310 path->slots[0]++;
311 continue;
312 }
313
314 csum_end = min(csum_end, end + 1);
315 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
316 struct btrfs_csum_item);
317 while (start < csum_end) {
318 size = min_t(size_t, csum_end - start,
319 MAX_ORDERED_SUM_BYTES(root));
320 sums = kzalloc(btrfs_ordered_sum_size(root, size),
321 GFP_NOFS);
322 BUG_ON(!sums);
323
324 sector_sum = sums->sums;
325 sums->bytenr = start;
326 sums->len = size;
327
328 offset = (start - key.offset) >>
329 root->fs_info->sb->s_blocksize_bits;
330 offset *= csum_size;
331
332 while (size > 0) {
333 read_extent_buffer(path->nodes[0],
334 &sector_sum->sum,
335 ((unsigned long)item) +
336 offset, csum_size);
337 sector_sum->bytenr = start;
338
339 size -= root->sectorsize;
340 start += root->sectorsize;
341 offset += csum_size;
342 sector_sum++;
343 }
344 list_add_tail(&sums->list, list);
345 }
346 path->slots[0]++;
347 }
348 ret = 0;
349fail:
350 btrfs_free_path(path);
351 return ret;
352}
353
354int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
355 struct bio *bio, u64 file_start, int contig)
356{
357 struct btrfs_ordered_sum *sums;
358 struct btrfs_sector_sum *sector_sum;
359 struct btrfs_ordered_extent *ordered;
360 char *data;
361 struct bio_vec *bvec = bio->bi_io_vec;
362 int bio_index = 0;
363 unsigned long total_bytes = 0;
364 unsigned long this_sum_bytes = 0;
365 u64 offset;
366 u64 disk_bytenr;
367
368 WARN_ON(bio->bi_vcnt <= 0);
369 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
370 if (!sums)
371 return -ENOMEM;
372
373 sector_sum = sums->sums;
374 disk_bytenr = (u64)bio->bi_sector << 9;
375 sums->len = bio->bi_size;
376 INIT_LIST_HEAD(&sums->list);
377
378 if (contig)
379 offset = file_start;
380 else
381 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
382
383 ordered = btrfs_lookup_ordered_extent(inode, offset);
384 BUG_ON(!ordered);
385 sums->bytenr = ordered->start;
386
387 while (bio_index < bio->bi_vcnt) {
388 if (!contig)
389 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
390
391 if (!contig && (offset >= ordered->file_offset + ordered->len ||
392 offset < ordered->file_offset)) {
393 unsigned long bytes_left;
394 sums->len = this_sum_bytes;
395 this_sum_bytes = 0;
396 btrfs_add_ordered_sum(inode, ordered, sums);
397 btrfs_put_ordered_extent(ordered);
398
399 bytes_left = bio->bi_size - total_bytes;
400
401 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
402 GFP_NOFS);
403 BUG_ON(!sums);
404 sector_sum = sums->sums;
405 sums->len = bytes_left;
406 ordered = btrfs_lookup_ordered_extent(inode, offset);
407 BUG_ON(!ordered);
408 sums->bytenr = ordered->start;
409 }
410
411 data = kmap_atomic(bvec->bv_page, KM_USER0);
412 sector_sum->sum = ~(u32)0;
413 sector_sum->sum = btrfs_csum_data(root,
414 data + bvec->bv_offset,
415 sector_sum->sum,
416 bvec->bv_len);
417 kunmap_atomic(data, KM_USER0);
418 btrfs_csum_final(sector_sum->sum,
419 (char *)&sector_sum->sum);
420 sector_sum->bytenr = disk_bytenr;
421
422 sector_sum++;
423 bio_index++;
424 total_bytes += bvec->bv_len;
425 this_sum_bytes += bvec->bv_len;
426 disk_bytenr += bvec->bv_len;
427 offset += bvec->bv_len;
428 bvec++;
429 }
430 this_sum_bytes = 0;
431 btrfs_add_ordered_sum(inode, ordered, sums);
432 btrfs_put_ordered_extent(ordered);
433 return 0;
434}
435
436/*
437 * helper function for csum removal, this expects the
438 * key to describe the csum pointed to by the path, and it expects
439 * the csum to overlap the range [bytenr, len]
440 *
441 * The csum should not be entirely contained in the range and the
442 * range should not be entirely contained in the csum.
443 *
444 * This calls btrfs_truncate_item with the correct args based on the
445 * overlap, and fixes up the key as required.
446 */
447static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
448 struct btrfs_root *root,
449 struct btrfs_path *path,
450 struct btrfs_key *key,
451 u64 bytenr, u64 len)
452{
453 struct extent_buffer *leaf;
454 u16 csum_size =
455 btrfs_super_csum_size(&root->fs_info->super_copy);
456 u64 csum_end;
457 u64 end_byte = bytenr + len;
458 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
459 int ret;
460
461 leaf = path->nodes[0];
462 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
463 csum_end <<= root->fs_info->sb->s_blocksize_bits;
464 csum_end += key->offset;
465
466 if (key->offset < bytenr && csum_end <= end_byte) {
467 /*
468 * [ bytenr - len ]
469 * [ ]
470 * [csum ]
471 * A simple truncate off the end of the item
472 */
473 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
474 new_size *= csum_size;
475 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
476 BUG_ON(ret);
477 } else if (key->offset >= bytenr && csum_end > end_byte &&
478 end_byte > key->offset) {
479 /*
480 * [ bytenr - len ]
481 * [ ]
482 * [csum ]
483 * we need to truncate from the beginning of the csum
484 */
485 u32 new_size = (csum_end - end_byte) >> blocksize_bits;
486 new_size *= csum_size;
487
488 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
489 BUG_ON(ret);
490
491 key->offset = end_byte;
492 ret = btrfs_set_item_key_safe(trans, root, path, key);
493 BUG_ON(ret);
494 } else {
495 BUG();
496 }
497 return 0;
498}
499
500/*
501 * deletes the csum items from the csum tree for a given
502 * range of bytes.
503 */
504int btrfs_del_csums(struct btrfs_trans_handle *trans,
505 struct btrfs_root *root, u64 bytenr, u64 len)
506{
507 struct btrfs_path *path;
508 struct btrfs_key key;
509 u64 end_byte = bytenr + len;
510 u64 csum_end;
511 struct extent_buffer *leaf;
512 int ret;
513 u16 csum_size =
514 btrfs_super_csum_size(&root->fs_info->super_copy);
515 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
516
517 root = root->fs_info->csum_root;
518
519 path = btrfs_alloc_path();
520
521 while (1) {
522 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
523 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY;
525
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) {
528 if (path->slots[0] == 0)
529 goto out;
530 path->slots[0]--;
531 }
532 leaf = path->nodes[0];
533 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
534
535 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
536 key.type != BTRFS_EXTENT_CSUM_KEY) {
537 break;
538 }
539
540 if (key.offset >= end_byte)
541 break;
542
543 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
544 csum_end <<= blocksize_bits;
545 csum_end += key.offset;
546
547 /* this csum ends before we start, we're done */
548 if (csum_end <= bytenr)
549 break;
550
551 /* delete the entire item, it is inside our range */
552 if (key.offset >= bytenr && csum_end <= end_byte) {
553 ret = btrfs_del_item(trans, root, path);
554 BUG_ON(ret);
555 if (key.offset == bytenr)
556 break;
557 } else if (key.offset < bytenr && csum_end > end_byte) {
558 unsigned long offset;
559 unsigned long shift_len;
560 unsigned long item_offset;
561 /*
562 * [ bytenr - len ]
563 * [csum ]
564 *
565 * Our bytes are in the middle of the csum,
566 * we need to split this item and insert a new one.
567 *
568 * But we can't drop the path because the
569 * csum could change, get removed, extended etc.
570 *
571 * The trick here is the max size of a csum item leaves
572 * enough room in the tree block for a single
573 * item header. So, we split the item in place,
574 * adding a new header pointing to the existing
575 * bytes. Then we loop around again and we have
576 * a nicely formed csum item that we can neatly
577 * truncate.
578 */
579 offset = (bytenr - key.offset) >> blocksize_bits;
580 offset *= csum_size;
581
582 shift_len = (len >> blocksize_bits) * csum_size;
583
584 item_offset = btrfs_item_ptr_offset(leaf,
585 path->slots[0]);
586
587 memset_extent_buffer(leaf, 0, item_offset + offset,
588 shift_len);
589 key.offset = bytenr;
590
591 /*
592 * btrfs_split_item returns -EAGAIN when the
593 * item changed size or key
594 */
595 ret = btrfs_split_item(trans, root, path, &key, offset);
596 BUG_ON(ret && ret != -EAGAIN);
597
598 key.offset = end_byte - 1;
599 } else {
600 ret = truncate_one_csum(trans, root, path,
601 &key, bytenr, len);
602 BUG_ON(ret);
603 if (key.offset < bytenr)
604 break;
605 }
606 btrfs_release_path(root, path);
607 }
608out:
609 btrfs_free_path(path);
610 return 0;
611}
612
613int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
614 struct btrfs_root *root,
615 struct btrfs_ordered_sum *sums)
616{
617 u64 bytenr;
618 int ret;
619 struct btrfs_key file_key;
620 struct btrfs_key found_key;
621 u64 next_offset;
622 u64 total_bytes = 0;
623 int found_next;
624 struct btrfs_path *path;
625 struct btrfs_csum_item *item;
626 struct btrfs_csum_item *item_end;
627 struct extent_buffer *leaf = NULL;
628 u64 csum_offset;
629 struct btrfs_sector_sum *sector_sum;
630 u32 nritems;
631 u32 ins_size;
632 char *eb_map;
633 char *eb_token;
634 unsigned long map_len;
635 unsigned long map_start;
636 u16 csum_size =
637 btrfs_super_csum_size(&root->fs_info->super_copy);
638
639 path = btrfs_alloc_path();
640 BUG_ON(!path);
641 sector_sum = sums->sums;
642again:
643 next_offset = (u64)-1;
644 found_next = 0;
645 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
646 file_key.offset = sector_sum->bytenr;
647 bytenr = sector_sum->bytenr;
648 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
649
650 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
651 if (!IS_ERR(item)) {
652 leaf = path->nodes[0];
653 ret = 0;
654 goto found;
655 }
656 ret = PTR_ERR(item);
657 if (ret == -EFBIG) {
658 u32 item_size;
659 /* we found one, but it isn't big enough yet */
660 leaf = path->nodes[0];
661 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
662 if ((item_size / csum_size) >=
663 MAX_CSUM_ITEMS(root, csum_size)) {
664 /* already at max size, make a new one */
665 goto insert;
666 }
667 } else {
668 int slot = path->slots[0] + 1;
669 /* we didn't find a csum item, insert one */
670 nritems = btrfs_header_nritems(path->nodes[0]);
671 if (path->slots[0] >= nritems - 1) {
672 ret = btrfs_next_leaf(root, path);
673 if (ret == 1)
674 found_next = 1;
675 if (ret != 0)
676 goto insert;
677 slot = 0;
678 }
679 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
680 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
681 found_key.type != BTRFS_EXTENT_CSUM_KEY) {
682 found_next = 1;
683 goto insert;
684 }
685 next_offset = found_key.offset;
686 found_next = 1;
687 goto insert;
688 }
689
690 /*
691 * at this point, we know the tree has an item, but it isn't big
692 * enough yet to put our csum in. Grow it
693 */
694 btrfs_release_path(root, path);
695 ret = btrfs_search_slot(trans, root, &file_key, path,
696 csum_size, 1);
697 if (ret < 0)
698 goto fail_unlock;
699
700 if (ret > 0) {
701 if (path->slots[0] == 0)
702 goto insert;
703 path->slots[0]--;
704 }
705
706 leaf = path->nodes[0];
707 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
708 csum_offset = (bytenr - found_key.offset) >>
709 root->fs_info->sb->s_blocksize_bits;
710
711 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
712 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
713 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
714 goto insert;
715 }
716
717 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
718 csum_size) {
719 u32 diff = (csum_offset + 1) * csum_size;
720
721 /*
722 * is the item big enough already? we dropped our lock
723 * before and need to recheck
724 */
725 if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
726 goto csum;
727
728 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
729 if (diff != csum_size)
730 goto insert;
731
732 ret = btrfs_extend_item(trans, root, path, diff);
733 BUG_ON(ret);
734 goto csum;
735 }
736
737insert:
738 btrfs_release_path(root, path);
739 csum_offset = 0;
740 if (found_next) {
741 u64 tmp = total_bytes + root->sectorsize;
742 u64 next_sector = sector_sum->bytenr;
743 struct btrfs_sector_sum *next = sector_sum + 1;
744
745 while (tmp < sums->len) {
746 if (next_sector + root->sectorsize != next->bytenr)
747 break;
748 tmp += root->sectorsize;
749 next_sector = next->bytenr;
750 next++;
751 }
752 tmp = min(tmp, next_offset - file_key.offset);
753 tmp >>= root->fs_info->sb->s_blocksize_bits;
754 tmp = max((u64)1, tmp);
755 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
756 ins_size = csum_size * tmp;
757 } else {
758 ins_size = csum_size;
759 }
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size);
762 if (ret < 0)
763 goto fail_unlock;
764 if (ret != 0) {
765 WARN_ON(1);
766 goto fail_unlock;
767 }
768csum:
769 leaf = path->nodes[0];
770 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
771 ret = 0;
772 item = (struct btrfs_csum_item *)((unsigned char *)item +
773 csum_offset * csum_size);
774found:
775 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL;
779 cond_resched();
780next_sector:
781
782 if (!eb_token ||
783 (unsigned long)item + csum_size >= map_start + map_len) {
784 int err;
785
786 if (eb_token)
787 unmap_extent_buffer(leaf, eb_token, KM_USER1);
788 eb_token = NULL;
789 err = map_private_extent_buffer(leaf, (unsigned long)item,
790 csum_size,
791 &eb_token, &eb_map,
792 &map_start, &map_len, KM_USER1);
793 if (err)
794 eb_token = NULL;
795 }
796 if (eb_token) {
797 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
798 &sector_sum->sum, csum_size);
799 } else {
800 write_extent_buffer(leaf, &sector_sum->sum,
801 (unsigned long)item, csum_size);
802 }
803
804 total_bytes += root->sectorsize;
805 sector_sum++;
806 if (total_bytes < sums->len) {
807 item = (struct btrfs_csum_item *)((char *)item +
808 csum_size);
809 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
810 sector_sum->bytenr) {
811 bytenr = sector_sum->bytenr;
812 goto next_sector;
813 }
814 }
815 if (eb_token) {
816 unmap_extent_buffer(leaf, eb_token, KM_USER1);
817 eb_token = NULL;
818 }
819 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path);
823 goto again;
824 }
825out:
826 btrfs_free_path(path);
827 return ret;
828
829fail_unlock:
830 goto out;
831}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..90268334145e
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user *buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
87 /* page checked is some magic around finding pages that
88 * have been modified without going through btrfs_set_page_dirty
89 * clear it here
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
141
142 /* check for reserved extents on each page, we don't want
143 * to reset the delalloc bit on things that already have
144 * extents reserved.
145 */
146 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
147 for (i = 0; i < num_pages; i++) {
148 struct page *p = pages[i];
149 SetPageUptodate(p);
150 ClearPageChecked(p);
151 set_page_dirty(p);
152 }
153 if (end_pos > isize) {
154 i_size_write(inode, end_pos);
155 btrfs_update_inode(trans, root, inode);
156 }
157 err = btrfs_end_transaction(trans, root);
158out_unlock:
159 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
160 return err;
161}
162
163/*
164 * this drops all the extents in the cache that intersect the range
165 * [start, end]. Existing extents are split as required.
166 */
167int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
168 int skip_pinned)
169{
170 struct extent_map *em;
171 struct extent_map *split = NULL;
172 struct extent_map *split2 = NULL;
173 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
174 u64 len = end - start + 1;
175 int ret;
176 int testend = 1;
177 unsigned long flags;
178 int compressed = 0;
179
180 WARN_ON(end < start);
181 if (end == (u64)-1) {
182 len = (u64)-1;
183 testend = 0;
184 }
185 while (1) {
186 if (!split)
187 split = alloc_extent_map(GFP_NOFS);
188 if (!split2)
189 split2 = alloc_extent_map(GFP_NOFS);
190
191 spin_lock(&em_tree->lock);
192 em = lookup_extent_mapping(em_tree, start, len);
193 if (!em) {
194 spin_unlock(&em_tree->lock);
195 break;
196 }
197 flags = em->flags;
198 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
199 spin_unlock(&em_tree->lock);
200 if (em->start <= start &&
201 (!testend || em->start + em->len >= start + len)) {
202 free_extent_map(em);
203 break;
204 }
205 if (start < em->start) {
206 len = em->start - start;
207 } else {
208 len = start + len - (em->start + em->len);
209 start = em->start + em->len;
210 }
211 free_extent_map(em);
212 continue;
213 }
214 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
215 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
216 remove_extent_mapping(em_tree, em);
217
218 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
219 em->start < start) {
220 split->start = em->start;
221 split->len = start - em->start;
222 split->orig_start = em->orig_start;
223 split->block_start = em->block_start;
224
225 if (compressed)
226 split->block_len = em->block_len;
227 else
228 split->block_len = split->len;
229
230 split->bdev = em->bdev;
231 split->flags = flags;
232 ret = add_extent_mapping(em_tree, split);
233 BUG_ON(ret);
234 free_extent_map(split);
235 split = split2;
236 split2 = NULL;
237 }
238 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
239 testend && em->start + em->len > start + len) {
240 u64 diff = start + len - em->start;
241
242 split->start = start + len;
243 split->len = em->start + em->len - (start + len);
244 split->bdev = em->bdev;
245 split->flags = flags;
246
247 if (compressed) {
248 split->block_len = em->block_len;
249 split->block_start = em->block_start;
250 split->orig_start = em->orig_start;
251 } else {
252 split->block_len = split->len;
253 split->block_start = em->block_start + diff;
254 split->orig_start = split->start;
255 }
256
257 ret = add_extent_mapping(em_tree, split);
258 BUG_ON(ret);
259 free_extent_map(split);
260 split = NULL;
261 }
262 spin_unlock(&em_tree->lock);
263
264 /* once for us */
265 free_extent_map(em);
266 /* once for the tree*/
267 free_extent_map(em);
268 }
269 if (split)
270 free_extent_map(split);
271 if (split2)
272 free_extent_map(split2);
273 return 0;
274}
275
276int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
277{
278 return 0;
279#if 0
280 struct btrfs_path *path;
281 struct btrfs_key found_key;
282 struct extent_buffer *leaf;
283 struct btrfs_file_extent_item *extent;
284 u64 last_offset = 0;
285 int nritems;
286 int slot;
287 int found_type;
288 int ret;
289 int err = 0;
290 u64 extent_end = 0;
291
292 path = btrfs_alloc_path();
293 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
294 last_offset, 0);
295 while (1) {
296 nritems = btrfs_header_nritems(path->nodes[0]);
297 if (path->slots[0] >= nritems) {
298 ret = btrfs_next_leaf(root, path);
299 if (ret)
300 goto out;
301 nritems = btrfs_header_nritems(path->nodes[0]);
302 }
303 slot = path->slots[0];
304 leaf = path->nodes[0];
305 btrfs_item_key_to_cpu(leaf, &found_key, slot);
306 if (found_key.objectid != inode->i_ino)
307 break;
308 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
309 goto out;
310
311 if (found_key.offset < last_offset) {
312 WARN_ON(1);
313 btrfs_print_leaf(root, leaf);
314 printk(KERN_ERR "inode %lu found offset %llu "
315 "expected %llu\n", inode->i_ino,
316 (unsigned long long)found_key.offset,
317 (unsigned long long)last_offset);
318 err = 1;
319 goto out;
320 }
321 extent = btrfs_item_ptr(leaf, slot,
322 struct btrfs_file_extent_item);
323 found_type = btrfs_file_extent_type(leaf, extent);
324 if (found_type == BTRFS_FILE_EXTENT_REG) {
325 extent_end = found_key.offset +
326 btrfs_file_extent_num_bytes(leaf, extent);
327 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
328 struct btrfs_item *item;
329 item = btrfs_item_nr(leaf, slot);
330 extent_end = found_key.offset +
331 btrfs_file_extent_inline_len(leaf, extent);
332 extent_end = (extent_end + root->sectorsize - 1) &
333 ~((u64)root->sectorsize - 1);
334 }
335 last_offset = extent_end;
336 path->slots[0]++;
337 }
338 if (0 && last_offset < inode->i_size) {
339 WARN_ON(1);
340 btrfs_print_leaf(root, leaf);
341 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
342 inode->i_ino, (unsigned long long)last_offset,
343 (unsigned long long)inode->i_size);
344 err = 1;
345
346 }
347out:
348 btrfs_free_path(path);
349 return err;
350#endif
351}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
355 * in the range start - end. hint_block is filled in with a block number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
364 */
365noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
385 struct btrfs_file_extent_item old;
386 int keep;
387 int slot;
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
401 while (1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
408 if (ret > 0) {
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
494 if (found_extent) {
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 btrfs_set_file_extent_num_bytes(leaf,
560 extent, new_num);
561 btrfs_mark_buffer_dirty(leaf);
562 } else if (key.offset < inline_limit &&
563 (end > extent_end) &&
564 (inline_limit < extent_end)) {
565 u32 new_size;
566 new_size = btrfs_file_extent_calc_inline_size(
567 inline_limit - key.offset);
568 inode_sub_bytes(inode, extent_end -
569 inline_limit);
570 btrfs_set_file_extent_ram_bytes(leaf, extent,
571 new_size);
572 if (!compression && !encryption) {
573 btrfs_truncate_item(trans, root, path,
574 new_size, 1);
575 }
576 }
577 }
578 /* delete the entire extent */
579 if (!keep) {
580 if (found_inline)
581 inode_sub_bytes(inode, extent_end -
582 key.offset);
583 ret = btrfs_del_item(trans, root, path);
584 /* TODO update progress marker and return */
585 BUG_ON(ret);
586 extent = NULL;
587 btrfs_release_path(root, path);
588 /* the extent will be freed later */
589 }
590 if (bookend && found_inline && start <= key.offset) {
591 u32 new_size;
592 new_size = btrfs_file_extent_calc_inline_size(
593 extent_end - end);
594 inode_sub_bytes(inode, end - key.offset);
595 btrfs_set_file_extent_ram_bytes(leaf, extent,
596 new_size);
597 if (!compression && !encryption)
598 ret = btrfs_truncate_item(trans, root, path,
599 new_size, 0);
600 BUG_ON(ret);
601 }
602 /* create bookend, splitting the extent in two */
603 if (bookend && found_extent) {
604 struct btrfs_key ins;
605 ins.objectid = inode->i_ino;
606 ins.offset = end;
607 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
608
609 btrfs_release_path(root, path);
610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
611 sizeof(*extent));
612 BUG_ON(ret);
613
614 leaf = path->nodes[0];
615 extent = btrfs_item_ptr(leaf, path->slots[0],
616 struct btrfs_file_extent_item);
617 write_extent_buffer(leaf, &old,
618 (unsigned long)extent, sizeof(old));
619
620 btrfs_set_file_extent_compression(leaf, extent,
621 compression);
622 btrfs_set_file_extent_encryption(leaf, extent,
623 encryption);
624 btrfs_set_file_extent_other_encoding(leaf, extent,
625 other_encoding);
626 btrfs_set_file_extent_offset(leaf, extent,
627 le64_to_cpu(old.offset) + end - key.offset);
628 WARN_ON(le64_to_cpu(old.num_bytes) <
629 (extent_end - end));
630 btrfs_set_file_extent_num_bytes(leaf, extent,
631 extent_end - end);
632
633 /*
634 * set the ram bytes to the size of the full extent
635 * before splitting. This is a worst case flag,
636 * but its the best we can do because we don't know
637 * how splitting affects compression
638 */
639 btrfs_set_file_extent_ram_bytes(leaf, extent,
640 ram_bytes);
641 btrfs_set_file_extent_type(leaf, extent, found_type);
642
643 btrfs_mark_buffer_dirty(path->nodes[0]);
644
645 if (disk_bytenr != 0) {
646 ret = btrfs_update_extent_ref(trans, root,
647 disk_bytenr, orig_parent,
648 leaf->start,
649 root->root_key.objectid,
650 trans->transid, ins.objectid);
651
652 BUG_ON(ret);
653 }
654 btrfs_release_path(root, path);
655 if (disk_bytenr != 0)
656 inode_add_bytes(inode, extent_end - end);
657 }
658
659 if (found_extent && !keep) {
660 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
661
662 if (old_disk_bytenr != 0) {
663 inode_sub_bytes(inode,
664 le64_to_cpu(old.num_bytes));
665 ret = btrfs_free_extent(trans, root,
666 old_disk_bytenr,
667 le64_to_cpu(old.disk_num_bytes),
668 leaf_start, root_owner,
669 root_gen, key.objectid, 0);
670 BUG_ON(ret);
671 *hint_byte = old_disk_bytenr;
672 }
673 }
674
675 if (search_start >= end) {
676 ret = 0;
677 goto out;
678 }
679 }
680out:
681 btrfs_free_path(path);
682 if (locked_end > end) {
683 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
684 GFP_NOFS);
685 }
686 btrfs_check_file(root, inode);
687 return ret;
688}
689
690static int extent_mergeable(struct extent_buffer *leaf, int slot,
691 u64 objectid, u64 bytenr, u64 *start, u64 *end)
692{
693 struct btrfs_file_extent_item *fi;
694 struct btrfs_key key;
695 u64 extent_end;
696
697 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
698 return 0;
699
700 btrfs_item_key_to_cpu(leaf, &key, slot);
701 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
702 return 0;
703
704 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
705 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
706 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
707 btrfs_file_extent_compression(leaf, fi) ||
708 btrfs_file_extent_encryption(leaf, fi) ||
709 btrfs_file_extent_other_encoding(leaf, fi))
710 return 0;
711
712 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
713 if ((*start && *start != key.offset) || (*end && *end != extent_end))
714 return 0;
715
716 *start = key.offset;
717 *end = extent_end;
718 return 1;
719}
720
721/*
722 * Mark extent in the range start - end as written.
723 *
724 * This changes extent type from 'pre-allocated' to 'regular'. If only
725 * part of extent is marked as written, the extent will be split into
726 * two or three.
727 */
728int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
729 struct btrfs_root *root,
730 struct inode *inode, u64 start, u64 end)
731{
732 struct extent_buffer *leaf;
733 struct btrfs_path *path;
734 struct btrfs_file_extent_item *fi;
735 struct btrfs_key key;
736 u64 bytenr;
737 u64 num_bytes;
738 u64 extent_end;
739 u64 extent_offset;
740 u64 other_start;
741 u64 other_end;
742 u64 split = start;
743 u64 locked_end = end;
744 u64 orig_parent;
745 int extent_type;
746 int split_end = 1;
747 int ret;
748
749 btrfs_drop_extent_cache(inode, start, end - 1, 0);
750
751 path = btrfs_alloc_path();
752 BUG_ON(!path);
753again:
754 key.objectid = inode->i_ino;
755 key.type = BTRFS_EXTENT_DATA_KEY;
756 if (split == start)
757 key.offset = split;
758 else
759 key.offset = split - 1;
760
761 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
762 if (ret > 0 && path->slots[0] > 0)
763 path->slots[0]--;
764
765 leaf = path->nodes[0];
766 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
767 BUG_ON(key.objectid != inode->i_ino ||
768 key.type != BTRFS_EXTENT_DATA_KEY);
769 fi = btrfs_item_ptr(leaf, path->slots[0],
770 struct btrfs_file_extent_item);
771 extent_type = btrfs_file_extent_type(leaf, fi);
772 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
773 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
774 BUG_ON(key.offset > start || extent_end < end);
775
776 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
777 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
778 extent_offset = btrfs_file_extent_offset(leaf, fi);
779
780 if (key.offset == start)
781 split = end;
782
783 if (key.offset == start && extent_end == end) {
784 int del_nr = 0;
785 int del_slot = 0;
786 u64 leaf_owner = btrfs_header_owner(leaf);
787 u64 leaf_gen = btrfs_header_generation(leaf);
788 other_start = end;
789 other_end = 0;
790 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
791 bytenr, &other_start, &other_end)) {
792 extent_end = other_end;
793 del_slot = path->slots[0] + 1;
794 del_nr++;
795 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
796 leaf->start, leaf_owner,
797 leaf_gen, inode->i_ino, 0);
798 BUG_ON(ret);
799 }
800 other_start = 0;
801 other_end = start;
802 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
803 bytenr, &other_start, &other_end)) {
804 key.offset = other_start;
805 del_slot = path->slots[0];
806 del_nr++;
807 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
808 leaf->start, leaf_owner,
809 leaf_gen, inode->i_ino, 0);
810 BUG_ON(ret);
811 }
812 split_end = 0;
813 if (del_nr == 0) {
814 btrfs_set_file_extent_type(leaf, fi,
815 BTRFS_FILE_EXTENT_REG);
816 goto done;
817 }
818
819 fi = btrfs_item_ptr(leaf, del_slot - 1,
820 struct btrfs_file_extent_item);
821 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
822 btrfs_set_file_extent_num_bytes(leaf, fi,
823 extent_end - key.offset);
824 btrfs_mark_buffer_dirty(leaf);
825
826 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
827 BUG_ON(ret);
828 goto done;
829 } else if (split == start) {
830 if (locked_end < extent_end) {
831 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
832 locked_end, extent_end - 1, GFP_NOFS);
833 if (!ret) {
834 btrfs_release_path(root, path);
835 lock_extent(&BTRFS_I(inode)->io_tree,
836 locked_end, extent_end - 1, GFP_NOFS);
837 locked_end = extent_end;
838 goto again;
839 }
840 locked_end = extent_end;
841 }
842 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
843 extent_offset += split - key.offset;
844 } else {
845 BUG_ON(key.offset != start);
846 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
847 split - key.offset);
848 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
849 key.offset = split;
850 btrfs_set_item_key_safe(trans, root, path, &key);
851 extent_end = split;
852 }
853
854 if (extent_end == end) {
855 split_end = 0;
856 extent_type = BTRFS_FILE_EXTENT_REG;
857 }
858 if (extent_end == end && split == start) {
859 other_start = end;
860 other_end = 0;
861 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
862 bytenr, &other_start, &other_end)) {
863 path->slots[0]++;
864 fi = btrfs_item_ptr(leaf, path->slots[0],
865 struct btrfs_file_extent_item);
866 key.offset = split;
867 btrfs_set_item_key_safe(trans, root, path, &key);
868 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
869 btrfs_set_file_extent_num_bytes(leaf, fi,
870 other_end - split);
871 goto done;
872 }
873 }
874 if (extent_end == end && split == end) {
875 other_start = 0;
876 other_end = start;
877 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
878 bytenr, &other_start, &other_end)) {
879 path->slots[0]--;
880 fi = btrfs_item_ptr(leaf, path->slots[0],
881 struct btrfs_file_extent_item);
882 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
883 other_start);
884 goto done;
885 }
886 }
887
888 btrfs_mark_buffer_dirty(leaf);
889
890 orig_parent = leaf->start;
891 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
892 orig_parent, root->root_key.objectid,
893 trans->transid, inode->i_ino);
894 BUG_ON(ret);
895 btrfs_release_path(root, path);
896
897 key.offset = start;
898 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
899 BUG_ON(ret);
900
901 leaf = path->nodes[0];
902 fi = btrfs_item_ptr(leaf, path->slots[0],
903 struct btrfs_file_extent_item);
904 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
905 btrfs_set_file_extent_type(leaf, fi, extent_type);
906 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
907 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
908 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
909 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
910 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
911 btrfs_set_file_extent_compression(leaf, fi, 0);
912 btrfs_set_file_extent_encryption(leaf, fi, 0);
913 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
914
915 if (orig_parent != leaf->start) {
916 ret = btrfs_update_extent_ref(trans, root, bytenr,
917 orig_parent, leaf->start,
918 root->root_key.objectid,
919 trans->transid, inode->i_ino);
920 BUG_ON(ret);
921 }
922done:
923 btrfs_mark_buffer_dirty(leaf);
924 btrfs_release_path(root, path);
925 if (split_end && split == start) {
926 split = end;
927 goto again;
928 }
929 if (locked_end > end) {
930 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
931 GFP_NOFS);
932 }
933 btrfs_free_path(path);
934 return 0;
935}
936
937/*
938 * this gets pages into the page cache and locks them down, it also properly
939 * waits for data=ordered extents to finish before allowing the pages to be
940 * modified.
941 */
942static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
943 struct page **pages, size_t num_pages,
944 loff_t pos, unsigned long first_index,
945 unsigned long last_index, size_t write_bytes)
946{
947 int i;
948 unsigned long index = pos >> PAGE_CACHE_SHIFT;
949 struct inode *inode = fdentry(file)->d_inode;
950 int err = 0;
951 u64 start_pos;
952 u64 last_pos;
953
954 start_pos = pos & ~((u64)root->sectorsize - 1);
955 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
956
957 if (start_pos > inode->i_size) {
958 err = btrfs_cont_expand(inode, start_pos);
959 if (err)
960 return err;
961 }
962
963 memset(pages, 0, num_pages * sizeof(struct page *));
964again:
965 for (i = 0; i < num_pages; i++) {
966 pages[i] = grab_cache_page(inode->i_mapping, index + i);
967 if (!pages[i]) {
968 err = -ENOMEM;
969 BUG_ON(1);
970 }
971 wait_on_page_writeback(pages[i]);
972 }
973 if (start_pos < inode->i_size) {
974 struct btrfs_ordered_extent *ordered;
975 lock_extent(&BTRFS_I(inode)->io_tree,
976 start_pos, last_pos - 1, GFP_NOFS);
977 ordered = btrfs_lookup_first_ordered_extent(inode,
978 last_pos - 1);
979 if (ordered &&
980 ordered->file_offset + ordered->len > start_pos &&
981 ordered->file_offset < last_pos) {
982 btrfs_put_ordered_extent(ordered);
983 unlock_extent(&BTRFS_I(inode)->io_tree,
984 start_pos, last_pos - 1, GFP_NOFS);
985 for (i = 0; i < num_pages; i++) {
986 unlock_page(pages[i]);
987 page_cache_release(pages[i]);
988 }
989 btrfs_wait_ordered_range(inode, start_pos,
990 last_pos - start_pos);
991 goto again;
992 }
993 if (ordered)
994 btrfs_put_ordered_extent(ordered);
995
996 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
997 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
998 GFP_NOFS);
999 unlock_extent(&BTRFS_I(inode)->io_tree,
1000 start_pos, last_pos - 1, GFP_NOFS);
1001 }
1002 for (i = 0; i < num_pages; i++) {
1003 clear_page_dirty_for_io(pages[i]);
1004 set_page_extent_mapped(pages[i]);
1005 WARN_ON(!PageLocked(pages[i]));
1006 }
1007 return 0;
1008}
1009
1010static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1011 size_t count, loff_t *ppos)
1012{
1013 loff_t pos;
1014 loff_t start_pos;
1015 ssize_t num_written = 0;
1016 ssize_t err = 0;
1017 int ret = 0;
1018 struct inode *inode = fdentry(file)->d_inode;
1019 struct btrfs_root *root = BTRFS_I(inode)->root;
1020 struct page **pages = NULL;
1021 int nrptrs;
1022 struct page *pinned[2];
1023 unsigned long first_index;
1024 unsigned long last_index;
1025 int will_write;
1026
1027 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1028 (file->f_flags & O_DIRECT));
1029
1030 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1031 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1032 pinned[0] = NULL;
1033 pinned[1] = NULL;
1034
1035 pos = *ppos;
1036 start_pos = pos;
1037
1038 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1039 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1040 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1041 if (err)
1042 goto out_nolock;
1043 if (count == 0)
1044 goto out_nolock;
1045
1046 err = file_remove_suid(file);
1047 if (err)
1048 goto out_nolock;
1049 file_update_time(file);
1050
1051 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1052
1053 mutex_lock(&inode->i_mutex);
1054 BTRFS_I(inode)->sequence++;
1055 first_index = pos >> PAGE_CACHE_SHIFT;
1056 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1057
1058 /*
1059 * there are lots of better ways to do this, but this code
1060 * makes sure the first and last page in the file range are
1061 * up to date and ready for cow
1062 */
1063 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1064 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1065 if (!PageUptodate(pinned[0])) {
1066 ret = btrfs_readpage(NULL, pinned[0]);
1067 BUG_ON(ret);
1068 wait_on_page_locked(pinned[0]);
1069 } else {
1070 unlock_page(pinned[0]);
1071 }
1072 }
1073 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1074 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1075 if (!PageUptodate(pinned[1])) {
1076 ret = btrfs_readpage(NULL, pinned[1]);
1077 BUG_ON(ret);
1078 wait_on_page_locked(pinned[1]);
1079 } else {
1080 unlock_page(pinned[1]);
1081 }
1082 }
1083
1084 while (count > 0) {
1085 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1086 size_t write_bytes = min(count, nrptrs *
1087 (size_t)PAGE_CACHE_SIZE -
1088 offset);
1089 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1090 PAGE_CACHE_SHIFT;
1091
1092 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094
1095 ret = btrfs_check_free_space(root, write_bytes, 0);
1096 if (ret)
1097 goto out;
1098
1099 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index,
1101 write_bytes);
1102 if (ret)
1103 goto out;
1104
1105 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf);
1107 if (ret) {
1108 btrfs_drop_pages(pages, num_pages);
1109 goto out;
1110 }
1111
1112 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages);
1115 if (ret)
1116 goto out;
1117
1118 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos,
1120 pos + write_bytes - 1,
1121 WB_SYNC_NONE);
1122 } else {
1123 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1124 num_pages);
1125 if (num_pages <
1126 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1127 btrfs_btree_balance_dirty(root, 1);
1128 btrfs_throttle(root);
1129 }
1130
1131 buf += write_bytes;
1132 count -= write_bytes;
1133 pos += write_bytes;
1134 num_written += write_bytes;
1135
1136 cond_resched();
1137 }
1138out:
1139 mutex_unlock(&inode->i_mutex);
1140
1141out_nolock:
1142 kfree(pages);
1143 if (pinned[0])
1144 page_cache_release(pinned[0]);
1145 if (pinned[1])
1146 page_cache_release(pinned[1]);
1147 *ppos = pos;
1148
1149 if (num_written > 0 && will_write) {
1150 struct btrfs_trans_handle *trans;
1151
1152 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1153 if (err)
1154 num_written = err;
1155
1156 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1157 trans = btrfs_start_transaction(root, 1);
1158 ret = btrfs_log_dentry_safe(trans, root,
1159 file->f_dentry);
1160 if (ret == 0) {
1161 btrfs_sync_log(trans, root);
1162 btrfs_end_transaction(trans, root);
1163 } else {
1164 btrfs_commit_transaction(trans, root);
1165 }
1166 }
1167 if (file->f_flags & O_DIRECT) {
1168 invalidate_mapping_pages(inode->i_mapping,
1169 start_pos >> PAGE_CACHE_SHIFT,
1170 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1171 }
1172 }
1173 current->backing_dev_info = NULL;
1174 return num_written ? num_written : err;
1175}
1176
1177int btrfs_release_file(struct inode *inode, struct file *filp)
1178{
1179 if (filp->private_data)
1180 btrfs_ioctl_trans_end(filp);
1181 return 0;
1182}
1183
1184/*
1185 * fsync call for both files and directories. This logs the inode into
1186 * the tree log instead of forcing full commits whenever possible.
1187 *
1188 * It needs to call filemap_fdatawait so that all ordered extent updates are
1189 * in the metadata btree are up to date for copying to the log.
1190 *
1191 * It drops the inode mutex before doing the tree log commit. This is an
1192 * important optimization for directories because holding the mutex prevents
1193 * new operations on the dir while we write to disk.
1194 */
1195int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1196{
1197 struct inode *inode = dentry->d_inode;
1198 struct btrfs_root *root = BTRFS_I(inode)->root;
1199 int ret = 0;
1200 struct btrfs_trans_handle *trans;
1201
1202 /*
1203 * check the transaction that last modified this inode
1204 * and see if its already been committed
1205 */
1206 if (!BTRFS_I(inode)->last_trans)
1207 goto out;
1208
1209 mutex_lock(&root->fs_info->trans_mutex);
1210 if (BTRFS_I(inode)->last_trans <=
1211 root->fs_info->last_trans_committed) {
1212 BTRFS_I(inode)->last_trans = 0;
1213 mutex_unlock(&root->fs_info->trans_mutex);
1214 goto out;
1215 }
1216 mutex_unlock(&root->fs_info->trans_mutex);
1217
1218 root->fs_info->tree_log_batch++;
1219 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++;
1222
1223 /*
1224 * ok we haven't committed the transaction yet, lets do a commit
1225 */
1226 if (file->private_data)
1227 btrfs_ioctl_trans_end(file);
1228
1229 trans = btrfs_start_transaction(root, 1);
1230 if (!trans) {
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1236 if (ret < 0)
1237 goto out;
1238
1239 /* we've logged all the items and now have a consistent
1240 * version of the file in the log. It is possible that
1241 * someone will come in and modify the file, but that's
1242 * fine because the log is consistent on disk, and we
1243 * have references to all of the file's extents
1244 *
1245 * It is possible that someone will come in and log the
1246 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe.
1248 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1250
1251 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root);
1253 } else {
1254 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root);
1256 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1258out:
1259 return ret > 0 ? EIO : ret;
1260}
1261
1262static struct vm_operations_struct btrfs_file_vm_ops = {
1263 .fault = filemap_fault,
1264 .page_mkwrite = btrfs_page_mkwrite,
1265};
1266
1267static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1268{
1269 vma->vm_ops = &btrfs_file_vm_ops;
1270 file_accessed(filp);
1271 return 0;
1272}
1273
1274struct file_operations btrfs_file_operations = {
1275 .llseek = generic_file_llseek,
1276 .read = do_sync_read,
1277 .aio_read = generic_file_aio_read,
1278 .splice_read = generic_file_splice_read,
1279 .write = btrfs_file_write,
1280 .mmap = btrfs_file_mmap,
1281 .open = generic_file_open,
1282 .release = btrfs_release_file,
1283 .fsync = btrfs_sync_file,
1284 .unlocked_ioctl = btrfs_ioctl,
1285#ifdef CONFIG_COMPAT
1286 .compat_ioctl = btrfs_ioctl,
1287#endif
1288};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
111
112/*
113 * return a chunk at least bytes size, as close to offset that we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
148 * keep searching just in case theres a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range
204 */
205 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1);
207 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1);
209
210 if (right_info && right_info->offset == offset+bytes) {
211 unlink_free_space(block_group, right_info);
212 info = right_info;
213 info->offset = offset;
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 }
225
226 if (left_info) {
227 unlink_free_space(block_group, left_info);
228
229 if (unlikely((left_info->offset + left_info->bytes) !=
230 offset)) {
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 }
251
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info);
265 if (ret)
266 kfree(info);
267out:
268 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST)
271 BUG();
272 }
273
274 kfree(alloc_info);
275
276 return ret;
277}
278
279static int
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
281 u64 offset, u64 bytes)
282{
283 struct btrfs_free_space *info;
284 int ret = 0;
285
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1);
288
289 if (info && info->offset == offset) {
290 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu,"
292 "trying to use %llu\n",
293 (unsigned long long)info->offset,
294 (unsigned long long)info->bytes,
295 (unsigned long long)bytes);
296 WARN_ON(1);
297 ret = -EINVAL;
298 goto out;
299 }
300 unlink_free_space(block_group, info);
301
302 if (info->bytes == bytes) {
303 kfree(info);
304 goto out;
305 }
306
307 info->offset += bytes;
308 info->bytes -= bytes;
309
310 ret = link_free_space(block_group, info);
311 BUG_ON(ret);
312 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) {
314 u64 old_start = info->offset;
315 /*
316 * we're freeing space in the middle of the info,
317 * this can happen during tree log replay
318 *
319 * first unlink the old info and then
320 * insert it again after the hole we're creating
321 */
322 unlink_free_space(block_group, info);
323 if (offset + bytes < info->offset + info->bytes) {
324 u64 old_end = info->offset + info->bytes;
325
326 info->offset = offset + bytes;
327 info->bytes = old_end - info->offset;
328 ret = link_free_space(block_group, info);
329 BUG_ON(ret);
330 } else {
331 /* the hole we're creating ends at the end
332 * of the info struct, just free the info
333 */
334 kfree(info);
335 }
336
337 /* step two, insert a new info struct to cover anything
338 * before the hole
339 */
340 ret = __btrfs_add_free_space(block_group, old_start,
341 offset - old_start);
342 BUG_ON(ret);
343 } else {
344 WARN_ON(1);
345 }
346out:
347 return ret;
348}
349
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes)
402{
403 struct btrfs_free_space *info;
404 struct rb_node *n;
405 int count = 0;
406
407 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
408 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes)
410 count++;
411 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count);
414}
415
416u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
417{
418 struct btrfs_free_space *info;
419 struct rb_node *n;
420 u64 ret = 0;
421
422 for (n = rb_first(&block_group->free_space_offset); n;
423 n = rb_next(n)) {
424 info = rb_entry(n, struct btrfs_free_space, offset_index);
425 ret += info->bytes;
426 }
427
428 return ret;
429}
430
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{
433 struct btrfs_free_space *info;
434 struct rb_node *node;
435
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info);
440 kfree(info);
441 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex);
443 cond_resched();
444 mutex_lock(&block_group->alloc_mutex);
445 }
446 }
447 mutex_unlock(&block_group->alloc_mutex);
448}
449
450#if 0
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{
456 struct btrfs_free_space *ret;
457
458 mutex_lock(&block_group->alloc_mutex);
459 ret = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0);
461 mutex_unlock(&block_group->alloc_mutex);
462
463 return ret;
464}
465
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
467 btrfs_block_group_cache
468 *block_group, u64 offset,
469 u64 bytes)
470{
471 struct btrfs_free_space *ret;
472
473 mutex_lock(&block_group->alloc_mutex);
474
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
476 mutex_unlock(&block_group->alloc_mutex);
477
478 return ret;
479}
480#endif
481
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
483 *block_group, u64 offset,
484 u64 bytes)
485{
486 struct btrfs_free_space *ret = NULL;
487
488 ret = tree_search_offset(&block_group->free_space_offset, offset,
489 bytes, 0);
490 if (!ret)
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493
494 return ret;
495}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 BUG_ON(1);
133found:
134 btrfs_release_path(root, path);
135 btrfs_free_path(path);
136 BUG_ON(*objectid < search_start);
137 mutex_unlock(&root->objectid_mutex);
138 return 0;
139error:
140 btrfs_release_path(root, path);
141 btrfs_free_path(path);
142 mutex_unlock(&root->objectid_mutex);
143 return ret;
144}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..8adfe059ab41
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h"
50#include "xattr.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
55struct btrfs_iget_args {
56 u64 ino;
57 struct btrfs_root *root;
58};
59
60static struct inode_operations btrfs_dir_inode_operations;
61static struct inode_operations btrfs_symlink_inode_operations;
62static struct inode_operations btrfs_dir_ro_inode_operations;
63static struct inode_operations btrfs_special_inode_operations;
64static struct inode_operations btrfs_file_inode_operations;
65static struct address_space_operations btrfs_aops;
66static struct address_space_operations btrfs_symlink_aops;
67static struct file_operations btrfs_dir_file_operations;
68static struct extent_io_ops btrfs_extent_io_ops;
69
70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep;
75
76#define S_SHIFT 12
77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
79 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
80 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
81 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
82 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
83 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
85};
86
87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
117 ret = -ENOSPC;
118 spin_unlock(&root->fs_info->delalloc_lock);
119 return ret;
120}
121
122/*
123 * this does all the hard work for inserting an inline extent into
124 * the btree. The caller should have done a btrfs_drop_extents so that
125 * no overlapping inline items exist in the btree
126 */
127static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
128 struct btrfs_root *root, struct inode *inode,
129 u64 start, size_t size, size_t compressed_size,
130 struct page **compressed_pages)
131{
132 struct btrfs_key key;
133 struct btrfs_path *path;
134 struct extent_buffer *leaf;
135 struct page *page = NULL;
136 char *kaddr;
137 unsigned long ptr;
138 struct btrfs_file_extent_item *ei;
139 int err = 0;
140 int ret;
141 size_t cur_size = size;
142 size_t datasize;
143 unsigned long offset;
144 int use_compress = 0;
145
146 if (compressed_size && compressed_pages) {
147 use_compress = 1;
148 cur_size = compressed_size;
149 }
150
151 path = btrfs_alloc_path();
152 if (!path)
153 return -ENOMEM;
154
155 btrfs_set_trans_block_group(trans, inode);
156
157 key.objectid = inode->i_ino;
158 key.offset = start;
159 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160 datasize = btrfs_file_extent_calc_inline_size(cur_size);
161
162 inode_add_bytes(inode, size);
163 ret = btrfs_insert_empty_item(trans, root, path, &key,
164 datasize);
165 BUG_ON(ret);
166 if (ret) {
167 err = ret;
168 goto fail;
169 }
170 leaf = path->nodes[0];
171 ei = btrfs_item_ptr(leaf, path->slots[0],
172 struct btrfs_file_extent_item);
173 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
174 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
175 btrfs_set_file_extent_encryption(leaf, ei, 0);
176 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
177 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
178 ptr = btrfs_file_extent_inline_start(ei);
179
180 if (use_compress) {
181 struct page *cpage;
182 int i = 0;
183 while (compressed_size > 0) {
184 cpage = compressed_pages[i];
185 cur_size = min_t(unsigned long, compressed_size,
186 PAGE_CACHE_SIZE);
187
188 kaddr = kmap(cpage);
189 write_extent_buffer(leaf, kaddr, ptr, cur_size);
190 kunmap(cpage);
191
192 i++;
193 ptr += cur_size;
194 compressed_size -= cur_size;
195 }
196 btrfs_set_file_extent_compression(leaf, ei,
197 BTRFS_COMPRESS_ZLIB);
198 } else {
199 page = find_get_page(inode->i_mapping,
200 start >> PAGE_CACHE_SHIFT);
201 btrfs_set_file_extent_compression(leaf, ei, 0);
202 kaddr = kmap_atomic(page, KM_USER0);
203 offset = start & (PAGE_CACHE_SIZE - 1);
204 write_extent_buffer(leaf, kaddr + offset, ptr, size);
205 kunmap_atomic(kaddr, KM_USER0);
206 page_cache_release(page);
207 }
208 btrfs_mark_buffer_dirty(leaf);
209 btrfs_free_path(path);
210
211 BTRFS_I(inode)->disk_i_size = inode->i_size;
212 btrfs_update_inode(trans, root, inode);
213 return 0;
214fail:
215 btrfs_free_path(path);
216 return err;
217}
218
219
220/*
221 * conditionally insert an inline extent into the file. This
222 * does the checks required to make sure the data is small enough
223 * to fit as an inline extent.
224 */
225static int cow_file_range_inline(struct btrfs_trans_handle *trans,
226 struct btrfs_root *root,
227 struct inode *inode, u64 start, u64 end,
228 size_t compressed_size,
229 struct page **compressed_pages)
230{
231 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) &
235 ~((u64)root->sectorsize - 1);
236 u64 hint_byte;
237 u64 data_len = inline_len;
238 int ret;
239
240 if (compressed_size)
241 data_len = compressed_size;
242
243 if (start > 0 ||
244 actual_end >= PAGE_CACHE_SIZE ||
245 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
246 (!compressed_size &&
247 (actual_end & (root->sectorsize - 1)) == 0) ||
248 end + 1 < isize ||
249 data_len > root->fs_info->max_inline) {
250 return 1;
251 }
252
253 ret = btrfs_drop_extents(trans, root, inode, start,
254 aligned_end, start, &hint_byte);
255 BUG_ON(ret);
256
257 if (isize > actual_end)
258 inline_len = min_t(u64, isize, actual_end);
259 ret = insert_inline_extent(trans, root, inode, start,
260 inline_len, compressed_size,
261 compressed_pages);
262 BUG_ON(ret);
263 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
264 return 0;
265}
266
267struct async_extent {
268 u64 start;
269 u64 ram_size;
270 u64 compressed_size;
271 struct page **pages;
272 unsigned long nr_pages;
273 struct list_head list;
274};
275
276struct async_cow {
277 struct inode *inode;
278 struct btrfs_root *root;
279 struct page *locked_page;
280 u64 start;
281 u64 end;
282 struct list_head extents;
283 struct btrfs_work work;
284};
285
286static noinline int add_async_extent(struct async_cow *cow,
287 u64 start, u64 ram_size,
288 u64 compressed_size,
289 struct page **pages,
290 unsigned long nr_pages)
291{
292 struct async_extent *async_extent;
293
294 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
295 async_extent->start = start;
296 async_extent->ram_size = ram_size;
297 async_extent->compressed_size = compressed_size;
298 async_extent->pages = pages;
299 async_extent->nr_pages = nr_pages;
300 list_add_tail(&async_extent->list, &cow->extents);
301 return 0;
302}
303
304/*
305 * we create compressed extents in two phases. The first
306 * phase compresses a range of pages that have already been
307 * locked (both pages and state bits are locked).
308 *
309 * This is done inside an ordered work queue, and the compression
310 * is spread across many cpus. The actual IO submission is step
311 * two, and the ordered work queue takes care of making sure that
312 * happens in the same order things were put onto the queue by
313 * writepages and friends.
314 *
315 * If this code finds it can't get good compression, it puts an
316 * entry onto the work queue to write the uncompressed bytes. This
317 * makes sure that both compressed inodes and uncompressed inodes
318 * are written in the same order that pdflush sent them down.
319 */
320static noinline int compress_file_range(struct inode *inode,
321 struct page *locked_page,
322 u64 start, u64 end,
323 struct async_cow *async_cow,
324 int *num_added)
325{
326 struct btrfs_root *root = BTRFS_I(inode)->root;
327 struct btrfs_trans_handle *trans;
328 u64 num_bytes;
329 u64 orig_start;
330 u64 disk_num_bytes;
331 u64 blocksize = root->sectorsize;
332 u64 actual_end;
333 u64 isize = i_size_read(inode);
334 int ret = 0;
335 struct page **pages = NULL;
336 unsigned long nr_pages;
337 unsigned long nr_pages_ret = 0;
338 unsigned long total_compressed = 0;
339 unsigned long total_in = 0;
340 unsigned long max_compressed = 128 * 1024;
341 unsigned long max_uncompressed = 128 * 1024;
342 int i;
343 int will_compress;
344
345 orig_start = start;
346
347 actual_end = min_t(u64, isize, end + 1);
348again:
349 will_compress = 0;
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353 total_compressed = actual_end - start;
354
355 /* we want to make sure that amount of ram required to uncompress
356 * an extent is reasonable, so we limit the total size in ram
357 * of a compressed extent to 128k. This is a crucial number
358 * because it also controls how easily we can spread reads across
359 * cpus for decompression.
360 *
361 * We also want to make sure the amount of IO required to do
362 * a random read is reasonably small, so we limit the size of
363 * a compressed extent to 128k.
364 */
365 total_compressed = min(total_compressed, max_uncompressed);
366 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
367 num_bytes = max(blocksize, num_bytes);
368 disk_num_bytes = num_bytes;
369 total_in = 0;
370 ret = 0;
371
372 /*
373 * we do compression for mount -o compress and when the
374 * inode has not been flagged as nocompress. This flag can
375 * change at any time if we discover bad compression ratios.
376 */
377 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
378 btrfs_test_opt(root, COMPRESS)) {
379 WARN_ON(pages);
380 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
381
382 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
383 total_compressed, pages,
384 nr_pages, &nr_pages_ret,
385 &total_in,
386 &total_compressed,
387 max_compressed);
388
389 if (!ret) {
390 unsigned long offset = total_compressed &
391 (PAGE_CACHE_SIZE - 1);
392 struct page *page = pages[nr_pages_ret - 1];
393 char *kaddr;
394
395 /* zero the tail end of the last page, we might be
396 * sending it down to disk
397 */
398 if (offset) {
399 kaddr = kmap_atomic(page, KM_USER0);
400 memset(kaddr + offset, 0,
401 PAGE_CACHE_SIZE - offset);
402 kunmap_atomic(kaddr, KM_USER0);
403 }
404 will_compress = 1;
405 }
406 }
407 if (start == 0) {
408 trans = btrfs_join_transaction(root, 1);
409 BUG_ON(!trans);
410 btrfs_set_trans_block_group(trans, inode);
411
412 /* lets try to make an inline extent */
413 if (ret || total_in < (actual_end - start)) {
414 /* we didn't compress the entire range, try
415 * to make an uncompressed inline extent.
416 */
417 ret = cow_file_range_inline(trans, root, inode,
418 start, end, 0, NULL);
419 } else {
420 /* try making a compressed inline extent */
421 ret = cow_file_range_inline(trans, root, inode,
422 start, end,
423 total_compressed, pages);
424 }
425 btrfs_end_transaction(trans, root);
426 if (ret == 0) {
427 /*
428 * inline extent creation worked, we don't need
429 * to create any more async work items. Unlock
430 * and free up our temp pages.
431 */
432 extent_clear_unlock_delalloc(inode,
433 &BTRFS_I(inode)->io_tree,
434 start, end, NULL, 1, 0,
435 0, 1, 1, 1);
436 ret = 0;
437 goto free_pages_out;
438 }
439 }
440
441 if (will_compress) {
442 /*
443 * we aren't doing an inline extent round the compressed size
444 * up to a block size boundary so the allocator does sane
445 * things
446 */
447 total_compressed = (total_compressed + blocksize - 1) &
448 ~(blocksize - 1);
449
450 /*
451 * one last check to make sure the compression is really a
452 * win, compare the page count read with the blocks on disk
453 */
454 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
455 ~(PAGE_CACHE_SIZE - 1);
456 if (total_compressed >= total_in) {
457 will_compress = 0;
458 } else {
459 disk_num_bytes = total_compressed;
460 num_bytes = total_in;
461 }
462 }
463 if (!will_compress && pages) {
464 /*
465 * the compression code ran but failed to make things smaller,
466 * free any pages it allocated and our page pointer array
467 */
468 for (i = 0; i < nr_pages_ret; i++) {
469 WARN_ON(pages[i]->mapping);
470 page_cache_release(pages[i]);
471 }
472 kfree(pages);
473 pages = NULL;
474 total_compressed = 0;
475 nr_pages_ret = 0;
476
477 /* flag the file so we don't compress in the future */
478 btrfs_set_flag(inode, NOCOMPRESS);
479 }
480 if (will_compress) {
481 *num_added += 1;
482
483 /* the async work queues will take care of doing actual
484 * allocation on disk for these compressed pages,
485 * and will submit them to the elevator.
486 */
487 add_async_extent(async_cow, start, num_bytes,
488 total_compressed, pages, nr_pages_ret);
489
490 if (start + num_bytes < end && start + num_bytes < actual_end) {
491 start += num_bytes;
492 pages = NULL;
493 cond_resched();
494 goto again;
495 }
496 } else {
497 /*
498 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked
500 * page if it corresponds to our extent and set things up
501 * for the async work queue to run cow_file_range to do
502 * the normal delalloc dance
503 */
504 if (page_offset(locked_page) >= start &&
505 page_offset(locked_page) <= end) {
506 __set_page_dirty_nobuffers(locked_page);
507 /* unlocked later on in the async handlers */
508 }
509 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
510 *num_added += 1;
511 }
512
513out:
514 return 0;
515
516free_pages_out:
517 for (i = 0; i < nr_pages_ret; i++) {
518 WARN_ON(pages[i]->mapping);
519 page_cache_release(pages[i]);
520 }
521 kfree(pages);
522
523 goto out;
524}
525
526/*
527 * phase two of compressed writeback. This is the ordered portion
528 * of the code, which only gets called in the order the work was
529 * queued. We walk all the async extents created by compress_file_range
530 * and send them down to the disk.
531 */
532static noinline int submit_compressed_extents(struct inode *inode,
533 struct async_cow *async_cow)
534{
535 struct async_extent *async_extent;
536 u64 alloc_hint = 0;
537 struct btrfs_trans_handle *trans;
538 struct btrfs_key ins;
539 struct extent_map *em;
540 struct btrfs_root *root = BTRFS_I(inode)->root;
541 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
542 struct extent_io_tree *io_tree;
543 int ret;
544
545 if (list_empty(&async_cow->extents))
546 return 0;
547
548 trans = btrfs_join_transaction(root, 1);
549
550 while (!list_empty(&async_cow->extents)) {
551 async_extent = list_entry(async_cow->extents.next,
552 struct async_extent, list);
553 list_del(&async_extent->list);
554
555 io_tree = &BTRFS_I(inode)->io_tree;
556
557 /* did the compression code fall back to uncompressed IO? */
558 if (!async_extent->pages) {
559 int page_started = 0;
560 unsigned long nr_written = 0;
561
562 lock_extent(io_tree, async_extent->start,
563 async_extent->start +
564 async_extent->ram_size - 1, GFP_NOFS);
565
566 /* allocate blocks */
567 cow_file_range(inode, async_cow->locked_page,
568 async_extent->start,
569 async_extent->start +
570 async_extent->ram_size - 1,
571 &page_started, &nr_written, 0);
572
573 /*
574 * if page_started, cow_file_range inserted an
575 * inline extent and took care of all the unlocking
576 * and IO for us. Otherwise, we need to submit
577 * all those pages down to the drive.
578 */
579 if (!page_started)
580 extent_write_locked_range(io_tree,
581 inode, async_extent->start,
582 async_extent->start +
583 async_extent->ram_size - 1,
584 btrfs_get_extent,
585 WB_SYNC_ALL);
586 kfree(async_extent);
587 cond_resched();
588 continue;
589 }
590
591 lock_extent(io_tree, async_extent->start,
592 async_extent->start + async_extent->ram_size - 1,
593 GFP_NOFS);
594 /*
595 * here we're doing allocation and writeback of the
596 * compressed pages
597 */
598 btrfs_drop_extent_cache(inode, async_extent->start,
599 async_extent->start +
600 async_extent->ram_size - 1, 0);
601
602 ret = btrfs_reserve_extent(trans, root,
603 async_extent->compressed_size,
604 async_extent->compressed_size,
605 0, alloc_hint,
606 (u64)-1, &ins, 1);
607 BUG_ON(ret);
608 em = alloc_extent_map(GFP_NOFS);
609 em->start = async_extent->start;
610 em->len = async_extent->ram_size;
611 em->orig_start = em->start;
612
613 em->block_start = ins.objectid;
614 em->block_len = ins.offset;
615 em->bdev = root->fs_info->fs_devices->latest_bdev;
616 set_bit(EXTENT_FLAG_PINNED, &em->flags);
617 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
618
619 while (1) {
620 spin_lock(&em_tree->lock);
621 ret = add_extent_mapping(em_tree, em);
622 spin_unlock(&em_tree->lock);
623 if (ret != -EEXIST) {
624 free_extent_map(em);
625 break;
626 }
627 btrfs_drop_extent_cache(inode, async_extent->start,
628 async_extent->start +
629 async_extent->ram_size - 1, 0);
630 }
631
632 ret = btrfs_add_ordered_extent(inode, async_extent->start,
633 ins.objectid,
634 async_extent->ram_size,
635 ins.offset,
636 BTRFS_ORDERED_COMPRESSED);
637 BUG_ON(ret);
638
639 btrfs_end_transaction(trans, root);
640
641 /*
642 * clear dirty, set writeback and unlock the pages.
643 */
644 extent_clear_unlock_delalloc(inode,
645 &BTRFS_I(inode)->io_tree,
646 async_extent->start,
647 async_extent->start +
648 async_extent->ram_size - 1,
649 NULL, 1, 1, 0, 1, 1, 0);
650
651 ret = btrfs_submit_compressed_write(inode,
652 async_extent->start,
653 async_extent->ram_size,
654 ins.objectid,
655 ins.offset, async_extent->pages,
656 async_extent->nr_pages);
657
658 BUG_ON(ret);
659 trans = btrfs_join_transaction(root, 1);
660 alloc_hint = ins.objectid + ins.offset;
661 kfree(async_extent);
662 cond_resched();
663 }
664
665 btrfs_end_transaction(trans, root);
666 return 0;
667}
668
669/*
670 * when extent_io.c finds a delayed allocation range in the file,
671 * the call backs end up in this code. The basic idea is to
672 * allocate extents on disk for the range, and create ordered data structs
673 * in ram to track those extents.
674 *
675 * locked_page is the page that writepage had locked already. We use
676 * it to make sure we don't do extra locks or unlocks.
677 *
678 * *page_started is set to one if we unlock locked_page and do everything
679 * required to start IO on it. It may be clean and already done with
680 * IO when we return.
681 */
682static noinline int cow_file_range(struct inode *inode,
683 struct page *locked_page,
684 u64 start, u64 end, int *page_started,
685 unsigned long *nr_written,
686 int unlock)
687{
688 struct btrfs_root *root = BTRFS_I(inode)->root;
689 struct btrfs_trans_handle *trans;
690 u64 alloc_hint = 0;
691 u64 num_bytes;
692 unsigned long ram_size;
693 u64 disk_num_bytes;
694 u64 cur_alloc_size;
695 u64 blocksize = root->sectorsize;
696 u64 actual_end;
697 u64 isize = i_size_read(inode);
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
706
707 actual_end = min_t(u64, isize, end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715 /* lets try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while (disk_num_bytes > 0) {
737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
739 root->sectorsize, 0, alloc_hint,
740 (u64)-1, &ins, 1);
741 BUG_ON(ret);
742
743 em = alloc_extent_map(GFP_NOFS);
744 em->start = start;
745 em->orig_start = em->start;
746
747 ram_size = ins.offset;
748 em->len = ins.offset;
749
750 em->block_start = ins.objectid;
751 em->block_len = ins.offset;
752 em->bdev = root->fs_info->fs_devices->latest_bdev;
753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
754
755 while (1) {
756 spin_lock(&em_tree->lock);
757 ret = add_extent_mapping(em_tree, em);
758 spin_unlock(&em_tree->lock);
759 if (ret != -EEXIST) {
760 free_extent_map(em);
761 break;
762 }
763 btrfs_drop_extent_cache(inode, start,
764 start + ram_size - 1, 0);
765 }
766
767 cur_alloc_size = ins.offset;
768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
769 ram_size, cur_alloc_size, 0);
770 BUG_ON(ret);
771
772 if (root->root_key.objectid ==
773 BTRFS_DATA_RELOC_TREE_OBJECTID) {
774 ret = btrfs_reloc_clone_csums(inode, start,
775 cur_alloc_size);
776 BUG_ON(ret);
777 }
778
779 if (disk_num_bytes < cur_alloc_size)
780 break;
781
782 /* we're not doing compressed IO, don't unlock the first
783 * page (which the caller expects to stay locked), don't
784 * clear any dirty bits and don't set any writeback bits
785 */
786 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
787 start, start + ram_size - 1,
788 locked_page, unlock, 1,
789 1, 0, 0, 0);
790 disk_num_bytes -= cur_alloc_size;
791 num_bytes -= cur_alloc_size;
792 alloc_hint = ins.objectid + ins.offset;
793 start += cur_alloc_size;
794 }
795out:
796 ret = 0;
797 btrfs_end_transaction(trans, root);
798
799 return ret;
800}
801
802/*
803 * work queue call back to started compression on a file and pages
804 */
805static noinline void async_cow_start(struct btrfs_work *work)
806{
807 struct async_cow *async_cow;
808 int num_added = 0;
809 async_cow = container_of(work, struct async_cow, work);
810
811 compress_file_range(async_cow->inode, async_cow->locked_page,
812 async_cow->start, async_cow->end, async_cow,
813 &num_added);
814 if (num_added == 0)
815 async_cow->inode = NULL;
816}
817
818/*
819 * work queue call back to submit previously compressed pages
820 */
821static noinline void async_cow_submit(struct btrfs_work *work)
822{
823 struct async_cow *async_cow;
824 struct btrfs_root *root;
825 unsigned long nr_pages;
826
827 async_cow = container_of(work, struct async_cow, work);
828
829 root = async_cow->root;
830 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
831 PAGE_CACHE_SHIFT;
832
833 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
834
835 if (atomic_read(&root->fs_info->async_delalloc_pages) <
836 5 * 1042 * 1024 &&
837 waitqueue_active(&root->fs_info->async_submit_wait))
838 wake_up(&root->fs_info->async_submit_wait);
839
840 if (async_cow->inode)
841 submit_compressed_extents(async_cow->inode, async_cow);
842}
843
844static noinline void async_cow_free(struct btrfs_work *work)
845{
846 struct async_cow *async_cow;
847 async_cow = container_of(work, struct async_cow, work);
848 kfree(async_cow);
849}
850
851static int cow_file_range_async(struct inode *inode, struct page *locked_page,
852 u64 start, u64 end, int *page_started,
853 unsigned long *nr_written)
854{
855 struct async_cow *async_cow;
856 struct btrfs_root *root = BTRFS_I(inode)->root;
857 unsigned long nr_pages;
858 u64 cur_end;
859 int limit = 10 * 1024 * 1042;
860
861 if (!btrfs_test_opt(root, COMPRESS)) {
862 return cow_file_range(inode, locked_page, start, end,
863 page_started, nr_written, 1);
864 }
865
866 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
867 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
868 while (start < end) {
869 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
870 async_cow->inode = inode;
871 async_cow->root = root;
872 async_cow->locked_page = locked_page;
873 async_cow->start = start;
874
875 if (btrfs_test_flag(inode, NOCOMPRESS))
876 cur_end = end;
877 else
878 cur_end = min(end, start + 512 * 1024 - 1);
879
880 async_cow->end = cur_end;
881 INIT_LIST_HEAD(&async_cow->extents);
882
883 async_cow->work.func = async_cow_start;
884 async_cow->work.ordered_func = async_cow_submit;
885 async_cow->work.ordered_free = async_cow_free;
886 async_cow->work.flags = 0;
887
888 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
889 PAGE_CACHE_SHIFT;
890 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
891
892 btrfs_queue_worker(&root->fs_info->delalloc_workers,
893 &async_cow->work);
894
895 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
896 wait_event(root->fs_info->async_submit_wait,
897 (atomic_read(&root->fs_info->async_delalloc_pages) <
898 limit));
899 }
900
901 while (atomic_read(&root->fs_info->async_submit_draining) &&
902 atomic_read(&root->fs_info->async_delalloc_pages)) {
903 wait_event(root->fs_info->async_submit_wait,
904 (atomic_read(&root->fs_info->async_delalloc_pages) ==
905 0));
906 }
907
908 *nr_written += nr_pages;
909 start = cur_end + 1;
910 }
911 *page_started = 1;
912 return 0;
913}
914
915static noinline int csum_exist_in_range(struct btrfs_root *root,
916 u64 bytenr, u64 num_bytes)
917{
918 int ret;
919 struct btrfs_ordered_sum *sums;
920 LIST_HEAD(list);
921
922 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
923 bytenr + num_bytes - 1, &list);
924 if (ret == 0 && list_empty(&list))
925 return 0;
926
927 while (!list_empty(&list)) {
928 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
929 list_del(&sums->list);
930 kfree(sums);
931 }
932 return 1;
933}
934
935/*
936 * when nowcow writeback call back. This checks for snapshots or COW copies
937 * of the extents that exist in the file, and COWs the file as required.
938 *
939 * If no cow copies or snapshots exist, we write directly to the existing
940 * blocks on disk
941 */
942static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
943 u64 start, u64 end, int *page_started, int force,
944 unsigned long *nr_written)
945{
946 struct btrfs_root *root = BTRFS_I(inode)->root;
947 struct btrfs_trans_handle *trans;
948 struct extent_buffer *leaf;
949 struct btrfs_path *path;
950 struct btrfs_file_extent_item *fi;
951 struct btrfs_key found_key;
952 u64 cow_start;
953 u64 cur_offset;
954 u64 extent_end;
955 u64 disk_bytenr;
956 u64 num_bytes;
957 int extent_type;
958 int ret;
959 int type;
960 int nocow;
961 int check_prev = 1;
962
963 path = btrfs_alloc_path();
964 BUG_ON(!path);
965 trans = btrfs_join_transaction(root, 1);
966 BUG_ON(!trans);
967
968 cow_start = (u64)-1;
969 cur_offset = start;
970 while (1) {
971 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
972 cur_offset, 0);
973 BUG_ON(ret < 0);
974 if (ret > 0 && path->slots[0] > 0 && check_prev) {
975 leaf = path->nodes[0];
976 btrfs_item_key_to_cpu(leaf, &found_key,
977 path->slots[0] - 1);
978 if (found_key.objectid == inode->i_ino &&
979 found_key.type == BTRFS_EXTENT_DATA_KEY)
980 path->slots[0]--;
981 }
982 check_prev = 0;
983next_slot:
984 leaf = path->nodes[0];
985 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
986 ret = btrfs_next_leaf(root, path);
987 if (ret < 0)
988 BUG_ON(1);
989 if (ret > 0)
990 break;
991 leaf = path->nodes[0];
992 }
993
994 nocow = 0;
995 disk_bytenr = 0;
996 num_bytes = 0;
997 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
998
999 if (found_key.objectid > inode->i_ino ||
1000 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1001 found_key.offset > end)
1002 break;
1003
1004 if (found_key.offset > cur_offset) {
1005 extent_end = found_key.offset;
1006 goto out_check;
1007 }
1008
1009 fi = btrfs_item_ptr(leaf, path->slots[0],
1010 struct btrfs_file_extent_item);
1011 extent_type = btrfs_file_extent_type(leaf, fi);
1012
1013 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1014 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1015 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1016 extent_end = found_key.offset +
1017 btrfs_file_extent_num_bytes(leaf, fi);
1018 if (extent_end <= start) {
1019 path->slots[0]++;
1020 goto next_slot;
1021 }
1022 if (disk_bytenr == 0)
1023 goto out_check;
1024 if (btrfs_file_extent_compression(leaf, fi) ||
1025 btrfs_file_extent_encryption(leaf, fi) ||
1026 btrfs_file_extent_other_encoding(leaf, fi))
1027 goto out_check;
1028 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1029 goto out_check;
1030 if (btrfs_extent_readonly(root, disk_bytenr))
1031 goto out_check;
1032 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1033 disk_bytenr))
1034 goto out_check;
1035 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1036 disk_bytenr += cur_offset - found_key.offset;
1037 num_bytes = min(end + 1, extent_end) - cur_offset;
1038 /*
1039 * force cow if csum exists in the range.
1040 * this ensure that csum for a given extent are
1041 * either valid or do not exist.
1042 */
1043 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1044 goto out_check;
1045 nocow = 1;
1046 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1047 extent_end = found_key.offset +
1048 btrfs_file_extent_inline_len(leaf, fi);
1049 extent_end = ALIGN(extent_end, root->sectorsize);
1050 } else {
1051 BUG_ON(1);
1052 }
1053out_check:
1054 if (extent_end <= start) {
1055 path->slots[0]++;
1056 goto next_slot;
1057 }
1058 if (!nocow) {
1059 if (cow_start == (u64)-1)
1060 cow_start = cur_offset;
1061 cur_offset = extent_end;
1062 if (cur_offset > end)
1063 break;
1064 path->slots[0]++;
1065 goto next_slot;
1066 }
1067
1068 btrfs_release_path(root, path);
1069 if (cow_start != (u64)-1) {
1070 ret = cow_file_range(inode, locked_page, cow_start,
1071 found_key.offset - 1, page_started,
1072 nr_written, 1);
1073 BUG_ON(ret);
1074 cow_start = (u64)-1;
1075 }
1076
1077 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1078 struct extent_map *em;
1079 struct extent_map_tree *em_tree;
1080 em_tree = &BTRFS_I(inode)->extent_tree;
1081 em = alloc_extent_map(GFP_NOFS);
1082 em->start = cur_offset;
1083 em->orig_start = em->start;
1084 em->len = num_bytes;
1085 em->block_len = num_bytes;
1086 em->block_start = disk_bytenr;
1087 em->bdev = root->fs_info->fs_devices->latest_bdev;
1088 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1089 while (1) {
1090 spin_lock(&em_tree->lock);
1091 ret = add_extent_mapping(em_tree, em);
1092 spin_unlock(&em_tree->lock);
1093 if (ret != -EEXIST) {
1094 free_extent_map(em);
1095 break;
1096 }
1097 btrfs_drop_extent_cache(inode, em->start,
1098 em->start + em->len - 1, 0);
1099 }
1100 type = BTRFS_ORDERED_PREALLOC;
1101 } else {
1102 type = BTRFS_ORDERED_NOCOW;
1103 }
1104
1105 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1106 num_bytes, num_bytes, type);
1107 BUG_ON(ret);
1108
1109 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1110 cur_offset, cur_offset + num_bytes - 1,
1111 locked_page, 1, 1, 1, 0, 0, 0);
1112 cur_offset = extent_end;
1113 if (cur_offset > end)
1114 break;
1115 }
1116 btrfs_release_path(root, path);
1117
1118 if (cur_offset <= end && cow_start == (u64)-1)
1119 cow_start = cur_offset;
1120 if (cow_start != (u64)-1) {
1121 ret = cow_file_range(inode, locked_page, cow_start, end,
1122 page_started, nr_written, 1);
1123 BUG_ON(ret);
1124 }
1125
1126 ret = btrfs_end_transaction(trans, root);
1127 BUG_ON(ret);
1128 btrfs_free_path(path);
1129 return 0;
1130}
1131
1132/*
1133 * extent_io.c call back to do delayed allocation processing
1134 */
1135static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1136 u64 start, u64 end, int *page_started,
1137 unsigned long *nr_written)
1138{
1139 int ret;
1140
1141 if (btrfs_test_flag(inode, NODATACOW))
1142 ret = run_delalloc_nocow(inode, locked_page, start, end,
1143 page_started, 1, nr_written);
1144 else if (btrfs_test_flag(inode, PREALLOC))
1145 ret = run_delalloc_nocow(inode, locked_page, start, end,
1146 page_started, 0, nr_written);
1147 else
1148 ret = cow_file_range_async(inode, locked_page, start, end,
1149 page_started, nr_written);
1150
1151 return ret;
1152}
1153
1154/*
1155 * extent_io.c set_bit_hook, used to track delayed allocation
1156 * bytes in this file, and to maintain the list of inodes that
1157 * have pending delalloc work to be done.
1158 */
1159static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1160 unsigned long old, unsigned long bits)
1161{
1162 /*
1163 * set_bit and clear bit hooks normally require _irqsave/restore
1164 * but in this case, we are only testeing for the DELALLOC
1165 * bit, which is only set or cleared with irqs on
1166 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root;
1169 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1;
1172 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1174 &root->fs_info->delalloc_inodes);
1175 }
1176 spin_unlock(&root->fs_info->delalloc_lock);
1177 }
1178 return 0;
1179}
1180
1181/*
1182 * extent_io.c clear_bit_hook, see set_bit_hook for why
1183 */
1184static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1185 unsigned long old, unsigned long bits)
1186{
1187 /*
1188 * set_bit and clear bit hooks normally require _irqsave/restore
1189 * but in this case, we are only testeing for the DELALLOC
1190 * bit, which is only set or cleared with irqs on
1191 */
1192 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1193 struct btrfs_root *root = BTRFS_I(inode)->root;
1194
1195 spin_lock(&root->fs_info->delalloc_lock);
1196 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1197 printk(KERN_INFO "btrfs warning: delalloc account "
1198 "%llu %llu\n",
1199 (unsigned long long)end - start + 1,
1200 (unsigned long long)
1201 root->fs_info->delalloc_bytes);
1202 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else {
1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1210 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1211 }
1212 spin_unlock(&root->fs_info->delalloc_lock);
1213 }
1214 return 0;
1215}
1216
1217/*
1218 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1219 * we don't create bios that span stripes or chunks
1220 */
1221int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1222 size_t size, struct bio *bio,
1223 unsigned long bio_flags)
1224{
1225 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1226 struct btrfs_mapping_tree *map_tree;
1227 u64 logical = (u64)bio->bi_sector << 9;
1228 u64 length = 0;
1229 u64 map_length;
1230 int ret;
1231
1232 if (bio_flags & EXTENT_BIO_COMPRESSED)
1233 return 0;
1234
1235 length = bio->bi_size;
1236 map_tree = &root->fs_info->mapping_tree;
1237 map_length = length;
1238 ret = btrfs_map_block(map_tree, READ, logical,
1239 &map_length, NULL, 0);
1240
1241 if (map_length < length + size)
1242 return 1;
1243 return 0;
1244}
1245
1246/*
1247 * in order to insert checksums into the metadata in large chunks,
1248 * we wait until bio submission time. All the pages in the bio are
1249 * checksummed and sums are attached onto the ordered extent record.
1250 *
1251 * At IO completion time the cums attached on the ordered extent record
1252 * are inserted into the btree
1253 */
1254static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1255 struct bio *bio, int mirror_num,
1256 unsigned long bio_flags)
1257{
1258 struct btrfs_root *root = BTRFS_I(inode)->root;
1259 int ret = 0;
1260
1261 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1262 BUG_ON(ret);
1263 return 0;
1264}
1265
1266/*
1267 * in order to insert checksums into the metadata in large chunks,
1268 * we wait until bio submission time. All the pages in the bio are
1269 * checksummed and sums are attached onto the ordered extent record.
1270 *
1271 * At IO completion time the cums attached on the ordered extent record
1272 * are inserted into the btree
1273 */
1274static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1275 int mirror_num, unsigned long bio_flags)
1276{
1277 struct btrfs_root *root = BTRFS_I(inode)->root;
1278 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1279}
1280
1281/*
1282 * extent_io.c submission hook. This does the right thing for csum calculation
1283 * on write, or reading the csums from the tree before a read
1284 */
1285static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1286 int mirror_num, unsigned long bio_flags)
1287{
1288 struct btrfs_root *root = BTRFS_I(inode)->root;
1289 int ret = 0;
1290 int skip_sum;
1291
1292 skip_sum = btrfs_test_flag(inode, NODATASUM);
1293
1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1295 BUG_ON(ret);
1296
1297 if (!(rw & (1 << BIO_RW))) {
1298 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1299 return btrfs_submit_compressed_read(inode, bio,
1300 mirror_num, bio_flags);
1301 } else if (!skip_sum)
1302 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1303 goto mapit;
1304 } else if (!skip_sum) {
1305 /* csum items have already been cloned */
1306 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1307 goto mapit;
1308 /* we're doing a write, do the async checksumming */
1309 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1310 inode, rw, bio, mirror_num,
1311 bio_flags, __btrfs_submit_bio_start,
1312 __btrfs_submit_bio_done);
1313 }
1314
1315mapit:
1316 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1317}
1318
1319/*
1320 * given a list of ordered sums record them in the inode. This happens
1321 * at IO completion time based on sums calculated at bio submission time.
1322 */
1323static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list)
1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum;
1329
1330 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) {
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1333 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 }
1336 return 0;
1337}
1338
1339int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1340{
1341 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1342 WARN_ON(1);
1343 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1344 GFP_NOFS);
1345}
1346
1347/* see btrfs_writepage_start_hook for details on why this is required */
1348struct btrfs_writepage_fixup {
1349 struct page *page;
1350 struct btrfs_work work;
1351};
1352
1353static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1354{
1355 struct btrfs_writepage_fixup *fixup;
1356 struct btrfs_ordered_extent *ordered;
1357 struct page *page;
1358 struct inode *inode;
1359 u64 page_start;
1360 u64 page_end;
1361
1362 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1363 page = fixup->page;
1364again:
1365 lock_page(page);
1366 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1367 ClearPageChecked(page);
1368 goto out_page;
1369 }
1370
1371 inode = page->mapping->host;
1372 page_start = page_offset(page);
1373 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1374
1375 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1376
1377 /* already ordered? We're done */
1378 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1379 EXTENT_ORDERED, 0)) {
1380 goto out;
1381 }
1382
1383 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1384 if (ordered) {
1385 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1386 page_end, GFP_NOFS);
1387 unlock_page(page);
1388 btrfs_start_ordered_extent(inode, ordered, 1);
1389 goto again;
1390 }
1391
1392 btrfs_set_extent_delalloc(inode, page_start, page_end);
1393 ClearPageChecked(page);
1394out:
1395 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1396out_page:
1397 unlock_page(page);
1398 page_cache_release(page);
1399}
1400
1401/*
1402 * There are a few paths in the higher layers of the kernel that directly
1403 * set the page dirty bit without asking the filesystem if it is a
1404 * good idea. This causes problems because we want to make sure COW
1405 * properly happens and the data=ordered rules are followed.
1406 *
1407 * In our case any range that doesn't have the ORDERED bit set
1408 * hasn't been properly setup for IO. We kick off an async process
1409 * to fix it up. The async helper will wait for ordered extents, set
1410 * the delalloc bit and make it safe to write the page.
1411 */
1412static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413{
1414 struct inode *inode = page->mapping->host;
1415 struct btrfs_writepage_fixup *fixup;
1416 struct btrfs_root *root = BTRFS_I(inode)->root;
1417 int ret;
1418
1419 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1420 EXTENT_ORDERED, 0);
1421 if (ret)
1422 return 0;
1423
1424 if (PageChecked(page))
1425 return -EAGAIN;
1426
1427 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1428 if (!fixup)
1429 return -EAGAIN;
1430
1431 SetPageChecked(page);
1432 page_cache_get(page);
1433 fixup->work.func = btrfs_writepage_fixup_worker;
1434 fixup->page = page;
1435 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1436 return -EAGAIN;
1437}
1438
1439static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1440 struct inode *inode, u64 file_pos,
1441 u64 disk_bytenr, u64 disk_num_bytes,
1442 u64 num_bytes, u64 ram_bytes,
1443 u8 compression, u8 encryption,
1444 u16 other_encoding, int extent_type)
1445{
1446 struct btrfs_root *root = BTRFS_I(inode)->root;
1447 struct btrfs_file_extent_item *fi;
1448 struct btrfs_path *path;
1449 struct extent_buffer *leaf;
1450 struct btrfs_key ins;
1451 u64 hint;
1452 int ret;
1453
1454 path = btrfs_alloc_path();
1455 BUG_ON(!path);
1456
1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1458 file_pos + num_bytes, file_pos, &hint);
1459 BUG_ON(ret);
1460
1461 ins.objectid = inode->i_ino;
1462 ins.offset = file_pos;
1463 ins.type = BTRFS_EXTENT_DATA_KEY;
1464 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1465 BUG_ON(ret);
1466 leaf = path->nodes[0];
1467 fi = btrfs_item_ptr(leaf, path->slots[0],
1468 struct btrfs_file_extent_item);
1469 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1470 btrfs_set_file_extent_type(leaf, fi, extent_type);
1471 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1472 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1473 btrfs_set_file_extent_offset(leaf, fi, 0);
1474 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1475 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479 btrfs_mark_buffer_dirty(leaf);
1480
1481 inode_add_bytes(inode, num_bytes);
1482 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1483
1484 ins.objectid = disk_bytenr;
1485 ins.offset = disk_num_bytes;
1486 ins.type = BTRFS_EXTENT_ITEM_KEY;
1487 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1488 root->root_key.objectid,
1489 trans->transid, inode->i_ino, &ins);
1490 BUG_ON(ret);
1491
1492 btrfs_free_path(path);
1493 return 0;
1494}
1495
1496/* as ordered data IO finishes, this gets called so we can finish
1497 * an ordered extent if the range of bytes in the file it covers are
1498 * fully written.
1499 */
1500static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1501{
1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 struct btrfs_trans_handle *trans;
1504 struct btrfs_ordered_extent *ordered_extent;
1505 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1506 int compressed = 0;
1507 int ret;
1508
1509 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1510 if (!ret)
1511 return 0;
1512
1513 trans = btrfs_join_transaction(root, 1);
1514
1515 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1516 BUG_ON(!ordered_extent);
1517 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1518 goto nocow;
1519
1520 lock_extent(io_tree, ordered_extent->file_offset,
1521 ordered_extent->file_offset + ordered_extent->len - 1,
1522 GFP_NOFS);
1523
1524 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1525 compressed = 1;
1526 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1527 BUG_ON(compressed);
1528 ret = btrfs_mark_extent_written(trans, root, inode,
1529 ordered_extent->file_offset,
1530 ordered_extent->file_offset +
1531 ordered_extent->len);
1532 BUG_ON(ret);
1533 } else {
1534 ret = insert_reserved_file_extent(trans, inode,
1535 ordered_extent->file_offset,
1536 ordered_extent->start,
1537 ordered_extent->disk_len,
1538 ordered_extent->len,
1539 ordered_extent->len,
1540 compressed, 0, 0,
1541 BTRFS_FILE_EXTENT_REG);
1542 BUG_ON(ret);
1543 }
1544 unlock_extent(io_tree, ordered_extent->file_offset,
1545 ordered_extent->file_offset + ordered_extent->len - 1,
1546 GFP_NOFS);
1547nocow:
1548 add_pending_csums(trans, inode, ordered_extent->file_offset,
1549 &ordered_extent->list);
1550
1551 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1552 btrfs_ordered_update_i_size(inode, ordered_extent);
1553 btrfs_update_inode(trans, root, inode);
1554 btrfs_remove_ordered_extent(inode, ordered_extent);
1555 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1556
1557 /* once for us */
1558 btrfs_put_ordered_extent(ordered_extent);
1559 /* once for the tree */
1560 btrfs_put_ordered_extent(ordered_extent);
1561
1562 btrfs_end_transaction(trans, root);
1563 return 0;
1564}
1565
1566static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1567 struct extent_state *state, int uptodate)
1568{
1569 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1570}
1571
1572/*
1573 * When IO fails, either with EIO or csum verification fails, we
1574 * try other mirrors that might have a good copy of the data. This
1575 * io_failure_record is used to record state as we go through all the
1576 * mirrors. If another mirror has good data, the page is set up to date
1577 * and things continue. If a good mirror can't be found, the original
1578 * bio end_io callback is called to indicate things have failed.
1579 */
1580struct io_failure_record {
1581 struct page *page;
1582 u64 start;
1583 u64 len;
1584 u64 logical;
1585 unsigned long bio_flags;
1586 int last_mirror;
1587};
1588
1589static int btrfs_io_failed_hook(struct bio *failed_bio,
1590 struct page *page, u64 start, u64 end,
1591 struct extent_state *state)
1592{
1593 struct io_failure_record *failrec = NULL;
1594 u64 private;
1595 struct extent_map *em;
1596 struct inode *inode = page->mapping->host;
1597 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1598 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1599 struct bio *bio;
1600 int num_copies;
1601 int ret;
1602 int rw;
1603 u64 logical;
1604
1605 ret = get_state_private(failure_tree, start, &private);
1606 if (ret) {
1607 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1608 if (!failrec)
1609 return -ENOMEM;
1610 failrec->start = start;
1611 failrec->len = end - start + 1;
1612 failrec->last_mirror = 0;
1613 failrec->bio_flags = 0;
1614
1615 spin_lock(&em_tree->lock);
1616 em = lookup_extent_mapping(em_tree, start, failrec->len);
1617 if (em->start > start || em->start + em->len < start) {
1618 free_extent_map(em);
1619 em = NULL;
1620 }
1621 spin_unlock(&em_tree->lock);
1622
1623 if (!em || IS_ERR(em)) {
1624 kfree(failrec);
1625 return -EIO;
1626 }
1627 logical = start - em->start;
1628 logical = em->block_start + logical;
1629 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1630 logical = em->block_start;
1631 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1632 }
1633 failrec->logical = logical;
1634 free_extent_map(em);
1635 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1636 EXTENT_DIRTY, GFP_NOFS);
1637 set_state_private(failure_tree, start,
1638 (u64)(unsigned long)failrec);
1639 } else {
1640 failrec = (struct io_failure_record *)(unsigned long)private;
1641 }
1642 num_copies = btrfs_num_copies(
1643 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1644 failrec->logical, failrec->len);
1645 failrec->last_mirror++;
1646 if (!state) {
1647 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1648 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1649 failrec->start,
1650 EXTENT_LOCKED);
1651 if (state && state->start != failrec->start)
1652 state = NULL;
1653 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1654 }
1655 if (!state || failrec->last_mirror > num_copies) {
1656 set_state_private(failure_tree, failrec->start, 0);
1657 clear_extent_bits(failure_tree, failrec->start,
1658 failrec->start + failrec->len - 1,
1659 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1660 kfree(failrec);
1661 return -EIO;
1662 }
1663 bio = bio_alloc(GFP_NOFS, 1);
1664 bio->bi_private = state;
1665 bio->bi_end_io = failed_bio->bi_end_io;
1666 bio->bi_sector = failrec->logical >> 9;
1667 bio->bi_bdev = failed_bio->bi_bdev;
1668 bio->bi_size = 0;
1669
1670 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1671 if (failed_bio->bi_rw & (1 << BIO_RW))
1672 rw = WRITE;
1673 else
1674 rw = READ;
1675
1676 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1677 failrec->last_mirror,
1678 failrec->bio_flags);
1679 return 0;
1680}
1681
1682/*
1683 * each time an IO finishes, we do a fast check in the IO failure tree
1684 * to see if we need to process or clean up an io_failure_record
1685 */
1686static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1687{
1688 u64 private;
1689 u64 private_failure;
1690 struct io_failure_record *failure;
1691 int ret;
1692
1693 private = 0;
1694 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1695 (u64)-1, 1, EXTENT_DIRTY)) {
1696 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1697 start, &private_failure);
1698 if (ret == 0) {
1699 failure = (struct io_failure_record *)(unsigned long)
1700 private_failure;
1701 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1702 failure->start, 0);
1703 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1704 failure->start,
1705 failure->start + failure->len - 1,
1706 EXTENT_DIRTY | EXTENT_LOCKED,
1707 GFP_NOFS);
1708 kfree(failure);
1709 }
1710 }
1711 return 0;
1712}
1713
1714/*
1715 * when reads are done, we need to check csums to verify the data is correct
1716 * if there's a match, we allow the bio to finish. If not, we go through
1717 * the io_failure_record routines to find good copies
1718 */
1719static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1720 struct extent_state *state)
1721{
1722 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1723 struct inode *inode = page->mapping->host;
1724 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1725 char *kaddr;
1726 u64 private = ~(u32)0;
1727 int ret;
1728 struct btrfs_root *root = BTRFS_I(inode)->root;
1729 u32 csum = ~(u32)0;
1730
1731 if (PageChecked(page)) {
1732 ClearPageChecked(page);
1733 goto good;
1734 }
1735 if (btrfs_test_flag(inode, NODATASUM))
1736 return 0;
1737
1738 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1739 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1740 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1741 GFP_NOFS);
1742 return 0;
1743 }
1744
1745 if (state && state->start == start) {
1746 private = state->private;
1747 ret = 0;
1748 } else {
1749 ret = get_state_private(io_tree, start, &private);
1750 }
1751 kaddr = kmap_atomic(page, KM_USER0);
1752 if (ret)
1753 goto zeroit;
1754
1755 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1756 btrfs_csum_final(csum, (char *)&csum);
1757 if (csum != private)
1758 goto zeroit;
1759
1760 kunmap_atomic(kaddr, KM_USER0);
1761good:
1762 /* if the io failure tree for this inode is non-empty,
1763 * check to see if we've recovered from a failed IO
1764 */
1765 btrfs_clean_io_failures(inode, start);
1766 return 0;
1767
1768zeroit:
1769 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)start, csum,
1772 (unsigned long long)private);
1773 memset(kaddr + offset, 1, end - start + 1);
1774 flush_dcache_page(page);
1775 kunmap_atomic(kaddr, KM_USER0);
1776 if (private == 0)
1777 return 0;
1778 return -EIO;
1779}
1780
1781/*
1782 * This creates an orphan entry for the given inode in case something goes
1783 * wrong in the middle of an unlink/truncate.
1784 */
1785int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1786{
1787 struct btrfs_root *root = BTRFS_I(inode)->root;
1788 int ret = 0;
1789
1790 spin_lock(&root->list_lock);
1791
1792 /* already on the orphan list, we're good */
1793 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1794 spin_unlock(&root->list_lock);
1795 return 0;
1796 }
1797
1798 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1799
1800 spin_unlock(&root->list_lock);
1801
1802 /*
1803 * insert an orphan item to track this unlinked/truncated file
1804 */
1805 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1806
1807 return ret;
1808}
1809
1810/*
1811 * We have done the truncate/delete so we can go ahead and remove the orphan
1812 * item for this particular inode.
1813 */
1814int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1815{
1816 struct btrfs_root *root = BTRFS_I(inode)->root;
1817 int ret = 0;
1818
1819 spin_lock(&root->list_lock);
1820
1821 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1822 spin_unlock(&root->list_lock);
1823 return 0;
1824 }
1825
1826 list_del_init(&BTRFS_I(inode)->i_orphan);
1827 if (!trans) {
1828 spin_unlock(&root->list_lock);
1829 return 0;
1830 }
1831
1832 spin_unlock(&root->list_lock);
1833
1834 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1835
1836 return ret;
1837}
1838
1839/*
1840 * this cleans up any orphans that may be left on the list from the last use
1841 * of this root.
1842 */
1843void btrfs_orphan_cleanup(struct btrfs_root *root)
1844{
1845 struct btrfs_path *path;
1846 struct extent_buffer *leaf;
1847 struct btrfs_item *item;
1848 struct btrfs_key key, found_key;
1849 struct btrfs_trans_handle *trans;
1850 struct inode *inode;
1851 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1852
1853 path = btrfs_alloc_path();
1854 if (!path)
1855 return;
1856 path->reada = -1;
1857
1858 key.objectid = BTRFS_ORPHAN_OBJECTID;
1859 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1860 key.offset = (u64)-1;
1861
1862
1863 while (1) {
1864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1865 if (ret < 0) {
1866 printk(KERN_ERR "Error searching slot for orphan: %d"
1867 "\n", ret);
1868 break;
1869 }
1870
1871 /*
1872 * if ret == 0 means we found what we were searching for, which
1873 * is weird, but possible, so only screw with path if we didnt
1874 * find the key and see if we have stuff that matches
1875 */
1876 if (ret > 0) {
1877 if (path->slots[0] == 0)
1878 break;
1879 path->slots[0]--;
1880 }
1881
1882 /* pull out the item */
1883 leaf = path->nodes[0];
1884 item = btrfs_item_nr(leaf, path->slots[0]);
1885 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1886
1887 /* make sure the item matches what we want */
1888 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1889 break;
1890 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1891 break;
1892
1893 /* release the path since we're done with it */
1894 btrfs_release_path(root, path);
1895
1896 /*
1897 * this is where we are basically btrfs_lookup, without the
1898 * crossing root thing. we store the inode number in the
1899 * offset of the orphan item.
1900 */
1901 inode = btrfs_iget_locked(root->fs_info->sb,
1902 found_key.offset, root);
1903 if (!inode)
1904 break;
1905
1906 if (inode->i_state & I_NEW) {
1907 BTRFS_I(inode)->root = root;
1908
1909 /* have to set the location manually */
1910 BTRFS_I(inode)->location.objectid = inode->i_ino;
1911 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1912 BTRFS_I(inode)->location.offset = 0;
1913
1914 btrfs_read_locked_inode(inode);
1915 unlock_new_inode(inode);
1916 }
1917
1918 /*
1919 * add this inode to the orphan list so btrfs_orphan_del does
1920 * the proper thing when we hit it
1921 */
1922 spin_lock(&root->list_lock);
1923 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1924 spin_unlock(&root->list_lock);
1925
1926 /*
1927 * if this is a bad inode, means we actually succeeded in
1928 * removing the inode, but not the orphan record, which means
1929 * we need to manually delete the orphan since iput will just
1930 * do a destroy_inode
1931 */
1932 if (is_bad_inode(inode)) {
1933 trans = btrfs_start_transaction(root, 1);
1934 btrfs_orphan_del(trans, inode);
1935 btrfs_end_transaction(trans, root);
1936 iput(inode);
1937 continue;
1938 }
1939
1940 /* if we have links, this was a truncate, lets do that */
1941 if (inode->i_nlink) {
1942 nr_truncate++;
1943 btrfs_truncate(inode);
1944 } else {
1945 nr_unlink++;
1946 }
1947
1948 /* this will do delete_inode and everything for us */
1949 iput(inode);
1950 }
1951
1952 if (nr_unlink)
1953 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1954 if (nr_truncate)
1955 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1956
1957 btrfs_free_path(path);
1958}
1959
1960/*
1961 * read an inode from the btree into the in-memory inode
1962 */
1963void btrfs_read_locked_inode(struct inode *inode)
1964{
1965 struct btrfs_path *path;
1966 struct extent_buffer *leaf;
1967 struct btrfs_inode_item *inode_item;
1968 struct btrfs_timespec *tspec;
1969 struct btrfs_root *root = BTRFS_I(inode)->root;
1970 struct btrfs_key location;
1971 u64 alloc_group_block;
1972 u32 rdev;
1973 int ret;
1974
1975 path = btrfs_alloc_path();
1976 BUG_ON(!path);
1977 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1978
1979 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1980 if (ret)
1981 goto make_bad;
1982
1983 leaf = path->nodes[0];
1984 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1985 struct btrfs_inode_item);
1986
1987 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1988 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1989 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1990 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1991 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1992
1993 tspec = btrfs_inode_atime(inode_item);
1994 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1995 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1996
1997 tspec = btrfs_inode_mtime(inode_item);
1998 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1999 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2000
2001 tspec = btrfs_inode_ctime(inode_item);
2002 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2003 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2004
2005 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2006 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2007 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2008 inode->i_generation = BTRFS_I(inode)->generation;
2009 inode->i_rdev = 0;
2010 rdev = btrfs_inode_rdev(leaf, inode_item);
2011
2012 BTRFS_I(inode)->index_cnt = (u64)-1;
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0);
2018 btrfs_free_path(path);
2019 inode_item = NULL;
2020
2021 switch (inode->i_mode & S_IFMT) {
2022 case S_IFREG:
2023 inode->i_mapping->a_ops = &btrfs_aops;
2024 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2025 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2026 inode->i_fop = &btrfs_file_operations;
2027 inode->i_op = &btrfs_file_inode_operations;
2028 break;
2029 case S_IFDIR:
2030 inode->i_fop = &btrfs_dir_file_operations;
2031 if (root == root->fs_info->tree_root)
2032 inode->i_op = &btrfs_dir_ro_inode_operations;
2033 else
2034 inode->i_op = &btrfs_dir_inode_operations;
2035 break;
2036 case S_IFLNK:
2037 inode->i_op = &btrfs_symlink_inode_operations;
2038 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break;
2041 default:
2042 init_special_inode(inode, inode->i_mode, rdev);
2043 break;
2044 }
2045 return;
2046
2047make_bad:
2048 btrfs_free_path(path);
2049 make_bad_inode(inode);
2050}
2051
2052/*
2053 * given a leaf and an inode, copy the inode fields into the leaf
2054 */
2055static void fill_inode_item(struct btrfs_trans_handle *trans,
2056 struct extent_buffer *leaf,
2057 struct btrfs_inode_item *item,
2058 struct inode *inode)
2059{
2060 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2061 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2062 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2063 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2064 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2065
2066 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2067 inode->i_atime.tv_sec);
2068 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2069 inode->i_atime.tv_nsec);
2070
2071 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2072 inode->i_mtime.tv_sec);
2073 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2074 inode->i_mtime.tv_nsec);
2075
2076 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2077 inode->i_ctime.tv_sec);
2078 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2079 inode->i_ctime.tv_nsec);
2080
2081 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2082 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2083 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2084 btrfs_set_inode_transid(leaf, item, trans->transid);
2085 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2086 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2087 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2088}
2089
2090/*
2091 * copy everything in the in-memory inode into the btree.
2092 */
2093noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root, struct inode *inode)
2095{
2096 struct btrfs_inode_item *inode_item;
2097 struct btrfs_path *path;
2098 struct extent_buffer *leaf;
2099 int ret;
2100
2101 path = btrfs_alloc_path();
2102 BUG_ON(!path);
2103 ret = btrfs_lookup_inode(trans, root, path,
2104 &BTRFS_I(inode)->location, 1);
2105 if (ret) {
2106 if (ret > 0)
2107 ret = -ENOENT;
2108 goto failed;
2109 }
2110
2111 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item);
2114
2115 fill_inode_item(trans, leaf, inode_item, inode);
2116 btrfs_mark_buffer_dirty(leaf);
2117 btrfs_set_inode_last_trans(trans, inode);
2118 ret = 0;
2119failed:
2120 btrfs_free_path(path);
2121 return ret;
2122}
2123
2124
2125/*
2126 * unlink helper that gets used here in inode.c and in the tree logging
2127 * recovery code. It remove a link in a directory with a given name, and
2128 * also drops the back refs in the inode to the directory
2129 */
2130int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2131 struct btrfs_root *root,
2132 struct inode *dir, struct inode *inode,
2133 const char *name, int name_len)
2134{
2135 struct btrfs_path *path;
2136 int ret = 0;
2137 struct extent_buffer *leaf;
2138 struct btrfs_dir_item *di;
2139 struct btrfs_key key;
2140 u64 index;
2141
2142 path = btrfs_alloc_path();
2143 if (!path) {
2144 ret = -ENOMEM;
2145 goto err;
2146 }
2147
2148 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2149 name, name_len, -1);
2150 if (IS_ERR(di)) {
2151 ret = PTR_ERR(di);
2152 goto err;
2153 }
2154 if (!di) {
2155 ret = -ENOENT;
2156 goto err;
2157 }
2158 leaf = path->nodes[0];
2159 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2160 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2161 if (ret)
2162 goto err;
2163 btrfs_release_path(root, path);
2164
2165 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2166 inode->i_ino,
2167 dir->i_ino, &index);
2168 if (ret) {
2169 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2170 "inode %lu parent %lu\n", name_len, name,
2171 inode->i_ino, dir->i_ino);
2172 goto err;
2173 }
2174
2175 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2176 index, name, name_len, -1);
2177 if (IS_ERR(di)) {
2178 ret = PTR_ERR(di);
2179 goto err;
2180 }
2181 if (!di) {
2182 ret = -ENOENT;
2183 goto err;
2184 }
2185 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2186 btrfs_release_path(root, path);
2187
2188 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2189 inode, dir->i_ino);
2190 BUG_ON(ret != 0 && ret != -ENOENT);
2191 if (ret != -ENOENT)
2192 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2193
2194 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2195 dir, index);
2196 BUG_ON(ret);
2197err:
2198 btrfs_free_path(path);
2199 if (ret)
2200 goto out;
2201
2202 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2203 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2204 btrfs_update_inode(trans, root, dir);
2205 btrfs_drop_nlink(inode);
2206 ret = btrfs_update_inode(trans, root, inode);
2207 dir->i_sb->s_dirt = 1;
2208out:
2209 return ret;
2210}
2211
2212static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2213{
2214 struct btrfs_root *root;
2215 struct btrfs_trans_handle *trans;
2216 struct inode *inode = dentry->d_inode;
2217 int ret;
2218 unsigned long nr = 0;
2219
2220 root = BTRFS_I(dir)->root;
2221
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1);
2227
2228 btrfs_set_trans_block_group(trans, dir);
2229 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230 dentry->d_name.name, dentry->d_name.len);
2231
2232 if (inode->i_nlink == 0)
2233 ret = btrfs_orphan_add(trans, inode);
2234
2235 nr = trans->blocks_used;
2236
2237 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr);
2240 return ret;
2241}
2242
2243static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2244{
2245 struct inode *inode = dentry->d_inode;
2246 int err = 0;
2247 int ret;
2248 struct btrfs_root *root = BTRFS_I(dir)->root;
2249 struct btrfs_trans_handle *trans;
2250 unsigned long nr = 0;
2251
2252 /*
2253 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2254 * the root of a subvolume or snapshot
2255 */
2256 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2257 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2258 return -ENOTEMPTY;
2259 }
2260
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir);
2267
2268 err = btrfs_orphan_add(trans, inode);
2269 if (err)
2270 goto fail_trans;
2271
2272 /* now the directory is empty */
2273 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2274 dentry->d_name.name, dentry->d_name.len);
2275 if (!err)
2276 btrfs_i_size_write(inode, 0);
2277
2278fail_trans:
2279 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr);
2283
2284 if (ret && !err)
2285 err = ret;
2286 return err;
2287}
2288
2289#if 0
2290/*
2291 * when truncating bytes in a file, it is possible to avoid reading
2292 * the leaves that contain only checksum items. This can be the
2293 * majority of the IO required to delete a large file, but it must
2294 * be done carefully.
2295 *
2296 * The keys in the level just above the leaves are checked to make sure
2297 * the lowest key in a given leaf is a csum key, and starts at an offset
2298 * after the new size.
2299 *
2300 * Then the key for the next leaf is checked to make sure it also has
2301 * a checksum item for the same file. If it does, we know our target leaf
2302 * contains only checksum items, and it can be safely freed without reading
2303 * it.
2304 *
2305 * This is just an optimization targeted at large files. It may do
2306 * nothing. It will return 0 unless things went badly.
2307 */
2308static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2309 struct btrfs_root *root,
2310 struct btrfs_path *path,
2311 struct inode *inode, u64 new_size)
2312{
2313 struct btrfs_key key;
2314 int ret;
2315 int nritems;
2316 struct btrfs_key found_key;
2317 struct btrfs_key other_key;
2318 struct btrfs_leaf_ref *ref;
2319 u64 leaf_gen;
2320 u64 leaf_start;
2321
2322 path->lowest_level = 1;
2323 key.objectid = inode->i_ino;
2324 key.type = BTRFS_CSUM_ITEM_KEY;
2325 key.offset = new_size;
2326again:
2327 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2328 if (ret < 0)
2329 goto out;
2330
2331 if (path->nodes[1] == NULL) {
2332 ret = 0;
2333 goto out;
2334 }
2335 ret = 0;
2336 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2337 nritems = btrfs_header_nritems(path->nodes[1]);
2338
2339 if (!nritems)
2340 goto out;
2341
2342 if (path->slots[1] >= nritems)
2343 goto next_node;
2344
2345 /* did we find a key greater than anything we want to delete? */
2346 if (found_key.objectid > inode->i_ino ||
2347 (found_key.objectid == inode->i_ino && found_key.type > key.type))
2348 goto out;
2349
2350 /* we check the next key in the node to make sure the leave contains
2351 * only checksum items. This comparison doesn't work if our
2352 * leaf is the last one in the node
2353 */
2354 if (path->slots[1] + 1 >= nritems) {
2355next_node:
2356 /* search forward from the last key in the node, this
2357 * will bring us into the next node in the tree
2358 */
2359 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2360
2361 /* unlikely, but we inc below, so check to be safe */
2362 if (found_key.offset == (u64)-1)
2363 goto out;
2364
2365 /* search_forward needs a path with locks held, do the
2366 * search again for the original key. It is possible
2367 * this will race with a balance and return a path that
2368 * we could modify, but this drop is just an optimization
2369 * and is allowed to miss some leaves.
2370 */
2371 btrfs_release_path(root, path);
2372 found_key.offset++;
2373
2374 /* setup a max key for search_forward */
2375 other_key.offset = (u64)-1;
2376 other_key.type = key.type;
2377 other_key.objectid = key.objectid;
2378
2379 path->keep_locks = 1;
2380 ret = btrfs_search_forward(root, &found_key, &other_key,
2381 path, 0, 0);
2382 path->keep_locks = 0;
2383 if (ret || found_key.objectid != key.objectid ||
2384 found_key.type != key.type) {
2385 ret = 0;
2386 goto out;
2387 }
2388
2389 key.offset = found_key.offset;
2390 btrfs_release_path(root, path);
2391 cond_resched();
2392 goto again;
2393 }
2394
2395 /* we know there's one more slot after us in the tree,
2396 * read that key so we can verify it is also a checksum item
2397 */
2398 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2399
2400 if (found_key.objectid < inode->i_ino)
2401 goto next_key;
2402
2403 if (found_key.type != key.type || found_key.offset < new_size)
2404 goto next_key;
2405
2406 /*
2407 * if the key for the next leaf isn't a csum key from this objectid,
2408 * we can't be sure there aren't good items inside this leaf.
2409 * Bail out
2410 */
2411 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2412 goto out;
2413
2414 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2415 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2416 /*
2417 * it is safe to delete this leaf, it contains only
2418 * csum items from this inode at an offset >= new_size
2419 */
2420 ret = btrfs_del_leaf(trans, root, path, leaf_start);
2421 BUG_ON(ret);
2422
2423 if (root->ref_cows && leaf_gen < trans->transid) {
2424 ref = btrfs_alloc_leaf_ref(root, 0);
2425 if (ref) {
2426 ref->root_gen = root->root_key.offset;
2427 ref->bytenr = leaf_start;
2428 ref->owner = 0;
2429 ref->generation = leaf_gen;
2430 ref->nritems = 0;
2431
2432 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref);
2435 } else {
2436 WARN_ON(1);
2437 }
2438 }
2439next_key:
2440 btrfs_release_path(root, path);
2441
2442 if (other_key.objectid == inode->i_ino &&
2443 other_key.type == key.type && other_key.offset > key.offset) {
2444 key.offset = other_key.offset;
2445 cond_resched();
2446 goto again;
2447 }
2448 ret = 0;
2449out:
2450 /* fixup any changes we've made to the path */
2451 path->lowest_level = 0;
2452 path->keep_locks = 0;
2453 btrfs_release_path(root, path);
2454 return ret;
2455}
2456
2457#endif
2458
2459/*
2460 * this can truncate away extent items, csum items and directory items.
2461 * It starts at a high offset and removes keys until it can't find
2462 * any higher than new_size
2463 *
2464 * csum items that cross the new i_size are truncated to the new size
2465 * as well.
2466 *
2467 * min_type is the minimum key type to truncate down to. If set to 0, this
2468 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2469 */
2470noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2471 struct btrfs_root *root,
2472 struct inode *inode,
2473 u64 new_size, u32 min_type)
2474{
2475 int ret;
2476 struct btrfs_path *path;
2477 struct btrfs_key key;
2478 struct btrfs_key found_key;
2479 u32 found_type;
2480 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0;
2483 u64 extent_num_bytes = 0;
2484 u64 item_end = 0;
2485 u64 root_gen = 0;
2486 u64 root_owner = 0;
2487 int found_extent;
2488 int del_item;
2489 int pending_del_nr = 0;
2490 int pending_del_slot = 0;
2491 int extent_type = -1;
2492 int encoding;
2493 u64 mask = root->sectorsize - 1;
2494
2495 if (root->ref_cows)
2496 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2497 path = btrfs_alloc_path();
2498 path->reada = -1;
2499 BUG_ON(!path);
2500
2501 /* FIXME, add redo link to tree so we don't leak on crash */
2502 key.objectid = inode->i_ino;
2503 key.offset = (u64)-1;
2504 key.type = (u8)-1;
2505
2506 btrfs_init_path(path);
2507
2508search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0)
2511 goto error;
2512
2513 if (ret > 0) {
2514 /* there are no items in the tree for us to truncate, we're
2515 * done
2516 */
2517 if (path->slots[0] == 0) {
2518 ret = 0;
2519 goto error;
2520 }
2521 path->slots[0]--;
2522 }
2523
2524 while (1) {
2525 fi = NULL;
2526 leaf = path->nodes[0];
2527 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2528 found_type = btrfs_key_type(&found_key);
2529 encoding = 0;
2530
2531 if (found_key.objectid != inode->i_ino)
2532 break;
2533
2534 if (found_type < min_type)
2535 break;
2536
2537 item_end = found_key.offset;
2538 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2539 fi = btrfs_item_ptr(leaf, path->slots[0],
2540 struct btrfs_file_extent_item);
2541 extent_type = btrfs_file_extent_type(leaf, fi);
2542 encoding = btrfs_file_extent_compression(leaf, fi);
2543 encoding |= btrfs_file_extent_encryption(leaf, fi);
2544 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2545
2546 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2547 item_end +=
2548 btrfs_file_extent_num_bytes(leaf, fi);
2549 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2550 item_end += btrfs_file_extent_inline_len(leaf,
2551 fi);
2552 }
2553 item_end--;
2554 }
2555 if (item_end < new_size) {
2556 if (found_type == BTRFS_DIR_ITEM_KEY)
2557 found_type = BTRFS_INODE_ITEM_KEY;
2558 else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2559 found_type = BTRFS_EXTENT_DATA_KEY;
2560 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2561 found_type = BTRFS_XATTR_ITEM_KEY;
2562 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2563 found_type = BTRFS_INODE_REF_KEY;
2564 else if (found_type)
2565 found_type--;
2566 else
2567 break;
2568 btrfs_set_key_type(&key, found_type);
2569 goto next;
2570 }
2571 if (found_key.offset >= new_size)
2572 del_item = 1;
2573 else
2574 del_item = 0;
2575 found_extent = 0;
2576
2577 /* FIXME, shrink the extent if the ref count is only 1 */
2578 if (found_type != BTRFS_EXTENT_DATA_KEY)
2579 goto delete;
2580
2581 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2582 u64 num_dec;
2583 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2584 if (!del_item && !encoding) {
2585 u64 orig_num_bytes =
2586 btrfs_file_extent_num_bytes(leaf, fi);
2587 extent_num_bytes = new_size -
2588 found_key.offset + root->sectorsize - 1;
2589 extent_num_bytes = extent_num_bytes &
2590 ~((u64)root->sectorsize - 1);
2591 btrfs_set_file_extent_num_bytes(leaf, fi,
2592 extent_num_bytes);
2593 num_dec = (orig_num_bytes -
2594 extent_num_bytes);
2595 if (root->ref_cows && extent_start != 0)
2596 inode_sub_bytes(inode, num_dec);
2597 btrfs_mark_buffer_dirty(leaf);
2598 } else {
2599 extent_num_bytes =
2600 btrfs_file_extent_disk_num_bytes(leaf,
2601 fi);
2602 /* FIXME blocksize != 4096 */
2603 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2604 if (extent_start != 0) {
2605 found_extent = 1;
2606 if (root->ref_cows)
2607 inode_sub_bytes(inode, num_dec);
2608 }
2609 root_gen = btrfs_header_generation(leaf);
2610 root_owner = btrfs_header_owner(leaf);
2611 }
2612 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2613 /*
2614 * we can't truncate inline items that have had
2615 * special encodings
2616 */
2617 if (!del_item &&
2618 btrfs_file_extent_compression(leaf, fi) == 0 &&
2619 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2620 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2621 u32 size = new_size - found_key.offset;
2622
2623 if (root->ref_cows) {
2624 inode_sub_bytes(inode, item_end + 1 -
2625 new_size);
2626 }
2627 size =
2628 btrfs_file_extent_calc_inline_size(size);
2629 ret = btrfs_truncate_item(trans, root, path,
2630 size, 1);
2631 BUG_ON(ret);
2632 } else if (root->ref_cows) {
2633 inode_sub_bytes(inode, item_end + 1 -
2634 found_key.offset);
2635 }
2636 }
2637delete:
2638 if (del_item) {
2639 if (!pending_del_nr) {
2640 /* no pending yet, add ourselves */
2641 pending_del_slot = path->slots[0];
2642 pending_del_nr = 1;
2643 } else if (pending_del_nr &&
2644 path->slots[0] + 1 == pending_del_slot) {
2645 /* hop on the pending chunk */
2646 pending_del_nr++;
2647 pending_del_slot = path->slots[0];
2648 } else {
2649 BUG();
2650 }
2651 } else {
2652 break;
2653 }
2654 if (found_extent) {
2655 ret = btrfs_free_extent(trans, root, extent_start,
2656 extent_num_bytes,
2657 leaf->start, root_owner,
2658 root_gen, inode->i_ino, 0);
2659 BUG_ON(ret);
2660 }
2661next:
2662 if (path->slots[0] == 0) {
2663 if (pending_del_nr)
2664 goto del_pending;
2665 btrfs_release_path(root, path);
2666 goto search_again;
2667 }
2668
2669 path->slots[0]--;
2670 if (pending_del_nr &&
2671 path->slots[0] + 1 != pending_del_slot) {
2672 struct btrfs_key debug;
2673del_pending:
2674 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2675 pending_del_slot);
2676 ret = btrfs_del_items(trans, root, path,
2677 pending_del_slot,
2678 pending_del_nr);
2679 BUG_ON(ret);
2680 pending_del_nr = 0;
2681 btrfs_release_path(root, path);
2682 goto search_again;
2683 }
2684 }
2685 ret = 0;
2686error:
2687 if (pending_del_nr) {
2688 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2689 pending_del_nr);
2690 }
2691 btrfs_free_path(path);
2692 inode->i_sb->s_dirt = 1;
2693 return ret;
2694}
2695
2696/*
2697 * taken from block_truncate_page, but does cow as it zeros out
2698 * any bytes left in the last page in the file.
2699 */
2700static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2701{
2702 struct inode *inode = mapping->host;
2703 struct btrfs_root *root = BTRFS_I(inode)->root;
2704 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2705 struct btrfs_ordered_extent *ordered;
2706 char *kaddr;
2707 u32 blocksize = root->sectorsize;
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 struct page *page;
2711 int ret = 0;
2712 u64 page_start;
2713 u64 page_end;
2714
2715 if ((offset & (blocksize - 1)) == 0)
2716 goto out;
2717
2718 ret = -ENOMEM;
2719again:
2720 page = grab_cache_page(mapping, index);
2721 if (!page)
2722 goto out;
2723
2724 page_start = page_offset(page);
2725 page_end = page_start + PAGE_CACHE_SIZE - 1;
2726
2727 if (!PageUptodate(page)) {
2728 ret = btrfs_readpage(NULL, page);
2729 lock_page(page);
2730 if (page->mapping != mapping) {
2731 unlock_page(page);
2732 page_cache_release(page);
2733 goto again;
2734 }
2735 if (!PageUptodate(page)) {
2736 ret = -EIO;
2737 goto out_unlock;
2738 }
2739 }
2740 wait_on_page_writeback(page);
2741
2742 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2743 set_page_extent_mapped(page);
2744
2745 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2746 if (ordered) {
2747 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2748 unlock_page(page);
2749 page_cache_release(page);
2750 btrfs_start_ordered_extent(inode, ordered, 1);
2751 btrfs_put_ordered_extent(ordered);
2752 goto again;
2753 }
2754
2755 btrfs_set_extent_delalloc(inode, page_start, page_end);
2756 ret = 0;
2757 if (offset != PAGE_CACHE_SIZE) {
2758 kaddr = kmap(page);
2759 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2760 flush_dcache_page(page);
2761 kunmap(page);
2762 }
2763 ClearPageChecked(page);
2764 set_page_dirty(page);
2765 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2766
2767out_unlock:
2768 unlock_page(page);
2769 page_cache_release(page);
2770out:
2771 return ret;
2772}
2773
2774int btrfs_cont_expand(struct inode *inode, loff_t size)
2775{
2776 struct btrfs_trans_handle *trans;
2777 struct btrfs_root *root = BTRFS_I(inode)->root;
2778 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2779 struct extent_map *em;
2780 u64 mask = root->sectorsize - 1;
2781 u64 hole_start = (inode->i_size + mask) & ~mask;
2782 u64 block_end = (size + mask) & ~mask;
2783 u64 last_byte;
2784 u64 cur_offset;
2785 u64 hole_size;
2786 int err;
2787
2788 if (size <= hole_start)
2789 return 0;
2790
2791 err = btrfs_check_free_space(root, 1, 0);
2792 if (err)
2793 return err;
2794
2795 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2796
2797 while (1) {
2798 struct btrfs_ordered_extent *ordered;
2799 btrfs_wait_ordered_range(inode, hole_start,
2800 block_end - hole_start);
2801 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2802 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2803 if (!ordered)
2804 break;
2805 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806 btrfs_put_ordered_extent(ordered);
2807 }
2808
2809 trans = btrfs_start_transaction(root, 1);
2810 btrfs_set_trans_block_group(trans, inode);
2811
2812 cur_offset = hole_start;
2813 while (1) {
2814 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2815 block_end - cur_offset, 0);
2816 BUG_ON(IS_ERR(em) || !em);
2817 last_byte = min(extent_map_end(em), block_end);
2818 last_byte = (last_byte + mask) & ~mask;
2819 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2820 u64 hint_byte = 0;
2821 hole_size = last_byte - cur_offset;
2822 err = btrfs_drop_extents(trans, root, inode,
2823 cur_offset,
2824 cur_offset + hole_size,
2825 cur_offset, &hint_byte);
2826 if (err)
2827 break;
2828 err = btrfs_insert_file_extent(trans, root,
2829 inode->i_ino, cur_offset, 0,
2830 0, hole_size, 0, hole_size,
2831 0, 0, 0);
2832 btrfs_drop_extent_cache(inode, hole_start,
2833 last_byte - 1, 0);
2834 }
2835 free_extent_map(em);
2836 cur_offset = last_byte;
2837 if (err || cur_offset >= block_end)
2838 break;
2839 }
2840
2841 btrfs_end_transaction(trans, root);
2842 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2843 return err;
2844}
2845
2846static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2847{
2848 struct inode *inode = dentry->d_inode;
2849 int err;
2850
2851 err = inode_change_ok(inode, attr);
2852 if (err)
2853 return err;
2854
2855 if (S_ISREG(inode->i_mode) &&
2856 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2857 err = btrfs_cont_expand(inode, attr->ia_size);
2858 if (err)
2859 return err;
2860 }
2861
2862 err = inode_setattr(inode, attr);
2863
2864 if (!err && ((attr->ia_valid & ATTR_MODE)))
2865 err = btrfs_acl_chmod(inode);
2866 return err;
2867}
2868
2869void btrfs_delete_inode(struct inode *inode)
2870{
2871 struct btrfs_trans_handle *trans;
2872 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 unsigned long nr;
2874 int ret;
2875
2876 truncate_inode_pages(&inode->i_data, 0);
2877 if (is_bad_inode(inode)) {
2878 btrfs_orphan_del(NULL, inode);
2879 goto no_delete;
2880 }
2881 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2882
2883 btrfs_i_size_write(inode, 0);
2884 trans = btrfs_join_transaction(root, 1);
2885
2886 btrfs_set_trans_block_group(trans, inode);
2887 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2888 if (ret) {
2889 btrfs_orphan_del(NULL, inode);
2890 goto no_delete_lock;
2891 }
2892
2893 btrfs_orphan_del(trans, inode);
2894
2895 nr = trans->blocks_used;
2896 clear_inode(inode);
2897
2898 btrfs_end_transaction(trans, root);
2899 btrfs_btree_balance_dirty(root, nr);
2900 return;
2901
2902no_delete_lock:
2903 nr = trans->blocks_used;
2904 btrfs_end_transaction(trans, root);
2905 btrfs_btree_balance_dirty(root, nr);
2906no_delete:
2907 clear_inode(inode);
2908}
2909
2910/*
2911 * this returns the key found in the dir entry in the location pointer.
2912 * If no dir entries were found, location->objectid is 0.
2913 */
2914static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2915 struct btrfs_key *location)
2916{
2917 const char *name = dentry->d_name.name;
2918 int namelen = dentry->d_name.len;
2919 struct btrfs_dir_item *di;
2920 struct btrfs_path *path;
2921 struct btrfs_root *root = BTRFS_I(dir)->root;
2922 int ret = 0;
2923
2924 path = btrfs_alloc_path();
2925 BUG_ON(!path);
2926
2927 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2928 namelen, 0);
2929 if (IS_ERR(di))
2930 ret = PTR_ERR(di);
2931
2932 if (!di || IS_ERR(di))
2933 goto out_err;
2934
2935 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2936out:
2937 btrfs_free_path(path);
2938 return ret;
2939out_err:
2940 location->objectid = 0;
2941 goto out;
2942}
2943
2944/*
2945 * when we hit a tree root in a directory, the btrfs part of the inode
2946 * needs to be changed to reflect the root directory of the tree root. This
2947 * is kind of like crossing a mount point.
2948 */
2949static int fixup_tree_root_location(struct btrfs_root *root,
2950 struct btrfs_key *location,
2951 struct btrfs_root **sub_root,
2952 struct dentry *dentry)
2953{
2954 struct btrfs_root_item *ri;
2955
2956 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2957 return 0;
2958 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2959 return 0;
2960
2961 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2962 dentry->d_name.name,
2963 dentry->d_name.len);
2964 if (IS_ERR(*sub_root))
2965 return PTR_ERR(*sub_root);
2966
2967 ri = &(*sub_root)->root_item;
2968 location->objectid = btrfs_root_dirid(ri);
2969 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2970 location->offset = 0;
2971
2972 return 0;
2973}
2974
2975static noinline void init_btrfs_i(struct inode *inode)
2976{
2977 struct btrfs_inode *bi = BTRFS_I(inode);
2978
2979 bi->i_acl = NULL;
2980 bi->i_default_acl = NULL;
2981
2982 bi->generation = 0;
2983 bi->sequence = 0;
2984 bi->last_trans = 0;
2985 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0;
2987 bi->disk_i_size = 0;
2988 bi->flags = 0;
2989 bi->index_cnt = (u64)-1;
2990 bi->log_dirty_trans = 0;
2991 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2992 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2993 inode->i_mapping, GFP_NOFS);
2994 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2995 inode->i_mapping, GFP_NOFS);
2996 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2997 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2998 mutex_init(&BTRFS_I(inode)->extent_mutex);
2999 mutex_init(&BTRFS_I(inode)->log_mutex);
3000}
3001
3002static int btrfs_init_locked_inode(struct inode *inode, void *p)
3003{
3004 struct btrfs_iget_args *args = p;
3005 inode->i_ino = args->ino;
3006 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root;
3008 return 0;
3009}
3010
3011static int btrfs_find_actor(struct inode *inode, void *opaque)
3012{
3013 struct btrfs_iget_args *args = opaque;
3014 return args->ino == inode->i_ino &&
3015 args->root == BTRFS_I(inode)->root;
3016}
3017
3018struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3019 struct btrfs_root *root, int wait)
3020{
3021 struct inode *inode;
3022 struct btrfs_iget_args args;
3023 args.ino = objectid;
3024 args.root = root;
3025
3026 if (wait) {
3027 inode = ilookup5(s, objectid, btrfs_find_actor,
3028 (void *)&args);
3029 } else {
3030 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3031 (void *)&args);
3032 }
3033 return inode;
3034}
3035
3036struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3037 struct btrfs_root *root)
3038{
3039 struct inode *inode;
3040 struct btrfs_iget_args args;
3041 args.ino = objectid;
3042 args.root = root;
3043
3044 inode = iget5_locked(s, objectid, btrfs_find_actor,
3045 btrfs_init_locked_inode,
3046 (void *)&args);
3047 return inode;
3048}
3049
3050/* Get an inode object given its location and corresponding root.
3051 * Returns in *is_new if the inode was read from disk
3052 */
3053struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3054 struct btrfs_root *root, int *is_new)
3055{
3056 struct inode *inode;
3057
3058 inode = btrfs_iget_locked(s, location->objectid, root);
3059 if (!inode)
3060 return ERR_PTR(-EACCES);
3061
3062 if (inode->i_state & I_NEW) {
3063 BTRFS_I(inode)->root = root;
3064 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3065 btrfs_read_locked_inode(inode);
3066 unlock_new_inode(inode);
3067 if (is_new)
3068 *is_new = 1;
3069 } else {
3070 if (is_new)
3071 *is_new = 0;
3072 }
3073
3074 return inode;
3075}
3076
3077struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3078{
3079 struct inode *inode;
3080 struct btrfs_inode *bi = BTRFS_I(dir);
3081 struct btrfs_root *root = bi->root;
3082 struct btrfs_root *sub_root = root;
3083 struct btrfs_key location;
3084 int ret, new;
3085
3086 if (dentry->d_name.len > BTRFS_NAME_LEN)
3087 return ERR_PTR(-ENAMETOOLONG);
3088
3089 ret = btrfs_inode_by_name(dir, dentry, &location);
3090
3091 if (ret < 0)
3092 return ERR_PTR(ret);
3093
3094 inode = NULL;
3095 if (location.objectid) {
3096 ret = fixup_tree_root_location(root, &location, &sub_root,
3097 dentry);
3098 if (ret < 0)
3099 return ERR_PTR(ret);
3100 if (ret > 0)
3101 return ERR_PTR(-ENOENT);
3102 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3103 if (IS_ERR(inode))
3104 return ERR_CAST(inode);
3105 }
3106 return inode;
3107}
3108
3109static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3110 struct nameidata *nd)
3111{
3112 struct inode *inode;
3113
3114 if (dentry->d_name.len > BTRFS_NAME_LEN)
3115 return ERR_PTR(-ENAMETOOLONG);
3116
3117 inode = btrfs_lookup_dentry(dir, dentry);
3118 if (IS_ERR(inode))
3119 return ERR_CAST(inode);
3120
3121 return d_splice_alias(inode, dentry);
3122}
3123
3124static unsigned char btrfs_filetype_table[] = {
3125 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3126};
3127
3128static int btrfs_real_readdir(struct file *filp, void *dirent,
3129 filldir_t filldir)
3130{
3131 struct inode *inode = filp->f_dentry->d_inode;
3132 struct btrfs_root *root = BTRFS_I(inode)->root;
3133 struct btrfs_item *item;
3134 struct btrfs_dir_item *di;
3135 struct btrfs_key key;
3136 struct btrfs_key found_key;
3137 struct btrfs_path *path;
3138 int ret;
3139 u32 nritems;
3140 struct extent_buffer *leaf;
3141 int slot;
3142 int advance;
3143 unsigned char d_type;
3144 int over = 0;
3145 u32 di_cur;
3146 u32 di_total;
3147 u32 di_len;
3148 int key_type = BTRFS_DIR_INDEX_KEY;
3149 char tmp_name[32];
3150 char *name_ptr;
3151 int name_len;
3152
3153 /* FIXME, use a real flag for deciding about the key type */
3154 if (root->fs_info->tree_root == root)
3155 key_type = BTRFS_DIR_ITEM_KEY;
3156
3157 /* special case for "." */
3158 if (filp->f_pos == 0) {
3159 over = filldir(dirent, ".", 1,
3160 1, inode->i_ino,
3161 DT_DIR);
3162 if (over)
3163 return 0;
3164 filp->f_pos = 1;
3165 }
3166 /* special case for .., just use the back ref */
3167 if (filp->f_pos == 1) {
3168 u64 pino = parent_ino(filp->f_path.dentry);
3169 over = filldir(dirent, "..", 2,
3170 2, pino, DT_DIR);
3171 if (over)
3172 return 0;
3173 filp->f_pos = 2;
3174 }
3175 path = btrfs_alloc_path();
3176 path->reada = 2;
3177
3178 btrfs_set_key_type(&key, key_type);
3179 key.offset = filp->f_pos;
3180 key.objectid = inode->i_ino;
3181
3182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3183 if (ret < 0)
3184 goto err;
3185 advance = 0;
3186
3187 while (1) {
3188 leaf = path->nodes[0];
3189 nritems = btrfs_header_nritems(leaf);
3190 slot = path->slots[0];
3191 if (advance || slot >= nritems) {
3192 if (slot >= nritems - 1) {
3193 ret = btrfs_next_leaf(root, path);
3194 if (ret)
3195 break;
3196 leaf = path->nodes[0];
3197 nritems = btrfs_header_nritems(leaf);
3198 slot = path->slots[0];
3199 } else {
3200 slot++;
3201 path->slots[0]++;
3202 }
3203 }
3204
3205 advance = 1;
3206 item = btrfs_item_nr(leaf, slot);
3207 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3208
3209 if (found_key.objectid != key.objectid)
3210 break;
3211 if (btrfs_key_type(&found_key) != key_type)
3212 break;
3213 if (found_key.offset < filp->f_pos)
3214 continue;
3215
3216 filp->f_pos = found_key.offset;
3217
3218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3219 di_cur = 0;
3220 di_total = btrfs_item_size(leaf, item);
3221
3222 while (di_cur < di_total) {
3223 struct btrfs_key location;
3224
3225 name_len = btrfs_dir_name_len(leaf, di);
3226 if (name_len <= sizeof(tmp_name)) {
3227 name_ptr = tmp_name;
3228 } else {
3229 name_ptr = kmalloc(name_len, GFP_NOFS);
3230 if (!name_ptr) {
3231 ret = -ENOMEM;
3232 goto err;
3233 }
3234 }
3235 read_extent_buffer(leaf, name_ptr,
3236 (unsigned long)(di + 1), name_len);
3237
3238 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3239 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3240
3241 /* is this a reference to our own snapshot? If so
3242 * skip it
3243 */
3244 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3245 location.objectid == root->root_key.objectid) {
3246 over = 0;
3247 goto skip;
3248 }
3249 over = filldir(dirent, name_ptr, name_len,
3250 found_key.offset, location.objectid,
3251 d_type);
3252
3253skip:
3254 if (name_ptr != tmp_name)
3255 kfree(name_ptr);
3256
3257 if (over)
3258 goto nopos;
3259 di_len = btrfs_dir_name_len(leaf, di) +
3260 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3261 di_cur += di_len;
3262 di = (struct btrfs_dir_item *)((char *)di + di_len);
3263 }
3264 }
3265
3266 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3269 else
3270 filp->f_pos++;
3271nopos:
3272 ret = 0;
3273err:
3274 btrfs_free_path(path);
3275 return ret;
3276}
3277
3278int btrfs_write_inode(struct inode *inode, int wait)
3279{
3280 struct btrfs_root *root = BTRFS_I(inode)->root;
3281 struct btrfs_trans_handle *trans;
3282 int ret = 0;
3283
3284 if (root->fs_info->btree_inode == inode)
3285 return 0;
3286
3287 if (wait) {
3288 trans = btrfs_join_transaction(root, 1);
3289 btrfs_set_trans_block_group(trans, inode);
3290 ret = btrfs_commit_transaction(trans, root);
3291 }
3292 return ret;
3293}
3294
3295/*
3296 * This is somewhat expensive, updating the tree every time the
3297 * inode changes. But, it is most likely to find the inode in cache.
3298 * FIXME, needs more benchmarking...there are no reasons other than performance
3299 * to keep or drop this code.
3300 */
3301void btrfs_dirty_inode(struct inode *inode)
3302{
3303 struct btrfs_root *root = BTRFS_I(inode)->root;
3304 struct btrfs_trans_handle *trans;
3305
3306 trans = btrfs_join_transaction(root, 1);
3307 btrfs_set_trans_block_group(trans, inode);
3308 btrfs_update_inode(trans, root, inode);
3309 btrfs_end_transaction(trans, root);
3310}
3311
3312/*
3313 * find the highest existing sequence number in a directory
3314 * and then set the in-memory index_cnt variable to reflect
3315 * free sequence numbers
3316 */
3317static int btrfs_set_inode_index_count(struct inode *inode)
3318{
3319 struct btrfs_root *root = BTRFS_I(inode)->root;
3320 struct btrfs_key key, found_key;
3321 struct btrfs_path *path;
3322 struct extent_buffer *leaf;
3323 int ret;
3324
3325 key.objectid = inode->i_ino;
3326 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3327 key.offset = (u64)-1;
3328
3329 path = btrfs_alloc_path();
3330 if (!path)
3331 return -ENOMEM;
3332
3333 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3334 if (ret < 0)
3335 goto out;
3336 /* FIXME: we should be able to handle this */
3337 if (ret == 0)
3338 goto out;
3339 ret = 0;
3340
3341 /*
3342 * MAGIC NUMBER EXPLANATION:
3343 * since we search a directory based on f_pos we have to start at 2
3344 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3345 * else has to start at 2
3346 */
3347 if (path->slots[0] == 0) {
3348 BTRFS_I(inode)->index_cnt = 2;
3349 goto out;
3350 }
3351
3352 path->slots[0]--;
3353
3354 leaf = path->nodes[0];
3355 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3356
3357 if (found_key.objectid != inode->i_ino ||
3358 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3359 BTRFS_I(inode)->index_cnt = 2;
3360 goto out;
3361 }
3362
3363 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3364out:
3365 btrfs_free_path(path);
3366 return ret;
3367}
3368
3369/*
3370 * helper to find a free sequence number in a given directory. This current
3371 * code is very simple, later versions will do smarter things in the btree
3372 */
3373int btrfs_set_inode_index(struct inode *dir, u64 *index)
3374{
3375 int ret = 0;
3376
3377 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3378 ret = btrfs_set_inode_index_count(dir);
3379 if (ret)
3380 return ret;
3381 }
3382
3383 *index = BTRFS_I(dir)->index_cnt;
3384 BTRFS_I(dir)->index_cnt++;
3385
3386 return ret;
3387}
3388
3389static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3390 struct btrfs_root *root,
3391 struct inode *dir,
3392 const char *name, int name_len,
3393 u64 ref_objectid, u64 objectid,
3394 u64 alloc_hint, int mode, u64 *index)
3395{
3396 struct inode *inode;
3397 struct btrfs_inode_item *inode_item;
3398 struct btrfs_key *location;
3399 struct btrfs_path *path;
3400 struct btrfs_inode_ref *ref;
3401 struct btrfs_key key[2];
3402 u32 sizes[2];
3403 unsigned long ptr;
3404 int ret;
3405 int owner;
3406
3407 path = btrfs_alloc_path();
3408 BUG_ON(!path);
3409
3410 inode = new_inode(root->fs_info->sb);
3411 if (!inode)
3412 return ERR_PTR(-ENOMEM);
3413
3414 if (dir) {
3415 ret = btrfs_set_inode_index(dir, index);
3416 if (ret)
3417 return ERR_PTR(ret);
3418 }
3419 /*
3420 * index_cnt is ignored for everything but a dir,
3421 * btrfs_get_inode_index_count has an explanation for the magic
3422 * number
3423 */
3424 init_btrfs_i(inode);
3425 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid;
3428
3429 if (mode & S_IFDIR)
3430 owner = 0;
3431 else
3432 owner = 1;
3433 BTRFS_I(inode)->block_group =
3434 btrfs_find_block_group(root, 0, alloc_hint, owner);
3435 if ((mode & S_IFREG)) {
3436 if (btrfs_test_opt(root, NODATASUM))
3437 btrfs_set_flag(inode, NODATASUM);
3438 if (btrfs_test_opt(root, NODATACOW))
3439 btrfs_set_flag(inode, NODATACOW);
3440 }
3441
3442 key[0].objectid = objectid;
3443 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3444 key[0].offset = 0;
3445
3446 key[1].objectid = objectid;
3447 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3448 key[1].offset = ref_objectid;
3449
3450 sizes[0] = sizeof(struct btrfs_inode_item);
3451 sizes[1] = name_len + sizeof(*ref);
3452
3453 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3454 if (ret != 0)
3455 goto fail;
3456
3457 if (objectid > root->highest_inode)
3458 root->highest_inode = objectid;
3459
3460 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid();
3462 inode->i_mode = mode;
3463 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0);
3465 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3466 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3467 struct btrfs_inode_item);
3468 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3469
3470 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3471 struct btrfs_inode_ref);
3472 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3473 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3474 ptr = (unsigned long)(ref + 1);
3475 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3476
3477 btrfs_mark_buffer_dirty(path->nodes[0]);
3478 btrfs_free_path(path);
3479
3480 location = &BTRFS_I(inode)->location;
3481 location->objectid = objectid;
3482 location->offset = 0;
3483 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3484
3485 insert_inode_hash(inode);
3486 return inode;
3487fail:
3488 if (dir)
3489 BTRFS_I(dir)->index_cnt--;
3490 btrfs_free_path(path);
3491 return ERR_PTR(ret);
3492}
3493
3494static inline u8 btrfs_inode_type(struct inode *inode)
3495{
3496 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3497}
3498
3499/*
3500 * utility function to add 'inode' into 'parent_inode' with
3501 * a give name and a given sequence number.
3502 * if 'add_backref' is true, also insert a backref from the
3503 * inode to the parent directory.
3504 */
3505int btrfs_add_link(struct btrfs_trans_handle *trans,
3506 struct inode *parent_inode, struct inode *inode,
3507 const char *name, int name_len, int add_backref, u64 index)
3508{
3509 int ret;
3510 struct btrfs_key key;
3511 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3512
3513 key.objectid = inode->i_ino;
3514 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3515 key.offset = 0;
3516
3517 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3518 parent_inode->i_ino,
3519 &key, btrfs_inode_type(inode),
3520 index);
3521 if (ret == 0) {
3522 if (add_backref) {
3523 ret = btrfs_insert_inode_ref(trans, root,
3524 name, name_len,
3525 inode->i_ino,
3526 parent_inode->i_ino,
3527 index);
3528 }
3529 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3530 name_len * 2);
3531 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3532 ret = btrfs_update_inode(trans, root, parent_inode);
3533 }
3534 return ret;
3535}
3536
3537static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3538 struct dentry *dentry, struct inode *inode,
3539 int backref, u64 index)
3540{
3541 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3542 inode, dentry->d_name.name,
3543 dentry->d_name.len, backref, index);
3544 if (!err) {
3545 d_instantiate(dentry, inode);
3546 return 0;
3547 }
3548 if (err > 0)
3549 err = -EEXIST;
3550 return err;
3551}
3552
3553static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3554 int mode, dev_t rdev)
3555{
3556 struct btrfs_trans_handle *trans;
3557 struct btrfs_root *root = BTRFS_I(dir)->root;
3558 struct inode *inode = NULL;
3559 int err;
3560 int drop_inode = 0;
3561 u64 objectid;
3562 unsigned long nr = 0;
3563 u64 index = 0;
3564
3565 if (!new_valid_dev(rdev))
3566 return -EINVAL;
3567
3568 err = btrfs_check_free_space(root, 1, 0);
3569 if (err)
3570 goto fail;
3571
3572 trans = btrfs_start_transaction(root, 1);
3573 btrfs_set_trans_block_group(trans, dir);
3574
3575 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3576 if (err) {
3577 err = -ENOSPC;
3578 goto out_unlock;
3579 }
3580
3581 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3582 dentry->d_name.len,
3583 dentry->d_parent->d_inode->i_ino, objectid,
3584 BTRFS_I(dir)->block_group, mode, &index);
3585 err = PTR_ERR(inode);
3586 if (IS_ERR(inode))
3587 goto out_unlock;
3588
3589 err = btrfs_init_acl(inode, dir);
3590 if (err) {
3591 drop_inode = 1;
3592 goto out_unlock;
3593 }
3594
3595 btrfs_set_trans_block_group(trans, inode);
3596 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3597 if (err)
3598 drop_inode = 1;
3599 else {
3600 inode->i_op = &btrfs_special_inode_operations;
3601 init_special_inode(inode, inode->i_mode, rdev);
3602 btrfs_update_inode(trans, root, inode);
3603 }
3604 dir->i_sb->s_dirt = 1;
3605 btrfs_update_inode_block_group(trans, inode);
3606 btrfs_update_inode_block_group(trans, dir);
3607out_unlock:
3608 nr = trans->blocks_used;
3609 btrfs_end_transaction_throttle(trans, root);
3610fail:
3611 if (drop_inode) {
3612 inode_dec_link_count(inode);
3613 iput(inode);
3614 }
3615 btrfs_btree_balance_dirty(root, nr);
3616 return err;
3617}
3618
3619static int btrfs_create(struct inode *dir, struct dentry *dentry,
3620 int mode, struct nameidata *nd)
3621{
3622 struct btrfs_trans_handle *trans;
3623 struct btrfs_root *root = BTRFS_I(dir)->root;
3624 struct inode *inode = NULL;
3625 int err;
3626 int drop_inode = 0;
3627 unsigned long nr = 0;
3628 u64 objectid;
3629 u64 index = 0;
3630
3631 err = btrfs_check_free_space(root, 1, 0);
3632 if (err)
3633 goto fail;
3634 trans = btrfs_start_transaction(root, 1);
3635 btrfs_set_trans_block_group(trans, dir);
3636
3637 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3638 if (err) {
3639 err = -ENOSPC;
3640 goto out_unlock;
3641 }
3642
3643 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3644 dentry->d_name.len,
3645 dentry->d_parent->d_inode->i_ino,
3646 objectid, BTRFS_I(dir)->block_group, mode,
3647 &index);
3648 err = PTR_ERR(inode);
3649 if (IS_ERR(inode))
3650 goto out_unlock;
3651
3652 err = btrfs_init_acl(inode, dir);
3653 if (err) {
3654 drop_inode = 1;
3655 goto out_unlock;
3656 }
3657
3658 btrfs_set_trans_block_group(trans, inode);
3659 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3660 if (err)
3661 drop_inode = 1;
3662 else {
3663 inode->i_mapping->a_ops = &btrfs_aops;
3664 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3665 inode->i_fop = &btrfs_file_operations;
3666 inode->i_op = &btrfs_file_inode_operations;
3667 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3668 }
3669 dir->i_sb->s_dirt = 1;
3670 btrfs_update_inode_block_group(trans, inode);
3671 btrfs_update_inode_block_group(trans, dir);
3672out_unlock:
3673 nr = trans->blocks_used;
3674 btrfs_end_transaction_throttle(trans, root);
3675fail:
3676 if (drop_inode) {
3677 inode_dec_link_count(inode);
3678 iput(inode);
3679 }
3680 btrfs_btree_balance_dirty(root, nr);
3681 return err;
3682}
3683
3684static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3685 struct dentry *dentry)
3686{
3687 struct btrfs_trans_handle *trans;
3688 struct btrfs_root *root = BTRFS_I(dir)->root;
3689 struct inode *inode = old_dentry->d_inode;
3690 u64 index;
3691 unsigned long nr = 0;
3692 int err;
3693 int drop_inode = 0;
3694
3695 if (inode->i_nlink == 0)
3696 return -ENOENT;
3697
3698 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0);
3700 if (err)
3701 goto fail;
3702 err = btrfs_set_inode_index(dir, &index);
3703 if (err)
3704 goto fail;
3705
3706 trans = btrfs_start_transaction(root, 1);
3707
3708 btrfs_set_trans_block_group(trans, dir);
3709 atomic_inc(&inode->i_count);
3710
3711 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3712
3713 if (err)
3714 drop_inode = 1;
3715
3716 dir->i_sb->s_dirt = 1;
3717 btrfs_update_inode_block_group(trans, dir);
3718 err = btrfs_update_inode(trans, root, inode);
3719
3720 if (err)
3721 drop_inode = 1;
3722
3723 nr = trans->blocks_used;
3724 btrfs_end_transaction_throttle(trans, root);
3725fail:
3726 if (drop_inode) {
3727 inode_dec_link_count(inode);
3728 iput(inode);
3729 }
3730 btrfs_btree_balance_dirty(root, nr);
3731 return err;
3732}
3733
3734static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3735{
3736 struct inode *inode = NULL;
3737 struct btrfs_trans_handle *trans;
3738 struct btrfs_root *root = BTRFS_I(dir)->root;
3739 int err = 0;
3740 int drop_on_err = 0;
3741 u64 objectid = 0;
3742 u64 index = 0;
3743 unsigned long nr = 1;
3744
3745 err = btrfs_check_free_space(root, 1, 0);
3746 if (err)
3747 goto out_unlock;
3748
3749 trans = btrfs_start_transaction(root, 1);
3750 btrfs_set_trans_block_group(trans, dir);
3751
3752 if (IS_ERR(trans)) {
3753 err = PTR_ERR(trans);
3754 goto out_unlock;
3755 }
3756
3757 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3758 if (err) {
3759 err = -ENOSPC;
3760 goto out_unlock;
3761 }
3762
3763 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3764 dentry->d_name.len,
3765 dentry->d_parent->d_inode->i_ino, objectid,
3766 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3767 &index);
3768 if (IS_ERR(inode)) {
3769 err = PTR_ERR(inode);
3770 goto out_fail;
3771 }
3772
3773 drop_on_err = 1;
3774
3775 err = btrfs_init_acl(inode, dir);
3776 if (err)
3777 goto out_fail;
3778
3779 inode->i_op = &btrfs_dir_inode_operations;
3780 inode->i_fop = &btrfs_dir_file_operations;
3781 btrfs_set_trans_block_group(trans, inode);
3782
3783 btrfs_i_size_write(inode, 0);
3784 err = btrfs_update_inode(trans, root, inode);
3785 if (err)
3786 goto out_fail;
3787
3788 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3789 inode, dentry->d_name.name,
3790 dentry->d_name.len, 0, index);
3791 if (err)
3792 goto out_fail;
3793
3794 d_instantiate(dentry, inode);
3795 drop_on_err = 0;
3796 dir->i_sb->s_dirt = 1;
3797 btrfs_update_inode_block_group(trans, inode);
3798 btrfs_update_inode_block_group(trans, dir);
3799
3800out_fail:
3801 nr = trans->blocks_used;
3802 btrfs_end_transaction_throttle(trans, root);
3803
3804out_unlock:
3805 if (drop_on_err)
3806 iput(inode);
3807 btrfs_btree_balance_dirty(root, nr);
3808 return err;
3809}
3810
3811/* helper for btfs_get_extent. Given an existing extent in the tree,
3812 * and an extent that you want to insert, deal with overlap and insert
3813 * the new extent into the tree.
3814 */
3815static int merge_extent_mapping(struct extent_map_tree *em_tree,
3816 struct extent_map *existing,
3817 struct extent_map *em,
3818 u64 map_start, u64 map_len)
3819{
3820 u64 start_diff;
3821
3822 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3823 start_diff = map_start - em->start;
3824 em->start = map_start;
3825 em->len = map_len;
3826 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3827 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3828 em->block_start += start_diff;
3829 em->block_len -= start_diff;
3830 }
3831 return add_extent_mapping(em_tree, em);
3832}
3833
3834static noinline int uncompress_inline(struct btrfs_path *path,
3835 struct inode *inode, struct page *page,
3836 size_t pg_offset, u64 extent_offset,
3837 struct btrfs_file_extent_item *item)
3838{
3839 int ret;
3840 struct extent_buffer *leaf = path->nodes[0];
3841 char *tmp;
3842 size_t max_size;
3843 unsigned long inline_size;
3844 unsigned long ptr;
3845
3846 WARN_ON(pg_offset != 0);
3847 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3848 inline_size = btrfs_file_extent_inline_item_len(leaf,
3849 btrfs_item_nr(leaf, path->slots[0]));
3850 tmp = kmalloc(inline_size, GFP_NOFS);
3851 ptr = btrfs_file_extent_inline_start(item);
3852
3853 read_extent_buffer(leaf, tmp, ptr, inline_size);
3854
3855 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3856 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3857 inline_size, max_size);
3858 if (ret) {
3859 char *kaddr = kmap_atomic(page, KM_USER0);
3860 unsigned long copy_size = min_t(u64,
3861 PAGE_CACHE_SIZE - pg_offset,
3862 max_size - extent_offset);
3863 memset(kaddr + pg_offset, 0, copy_size);
3864 kunmap_atomic(kaddr, KM_USER0);
3865 }
3866 kfree(tmp);
3867 return 0;
3868}
3869
3870/*
3871 * a bit scary, this does extent mapping from logical file offset to the disk.
3872 * the ugly parts come from merging extents from the disk with the in-ram
3873 * representation. This gets more complex because of the data=ordered code,
3874 * where the in-ram extents might be locked pending data=ordered completion.
3875 *
3876 * This also copies inline extents directly into the page.
3877 */
3878
3879struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3880 size_t pg_offset, u64 start, u64 len,
3881 int create)
3882{
3883 int ret;
3884 int err = 0;
3885 u64 bytenr;
3886 u64 extent_start = 0;
3887 u64 extent_end = 0;
3888 u64 objectid = inode->i_ino;
3889 u32 found_type;
3890 struct btrfs_path *path = NULL;
3891 struct btrfs_root *root = BTRFS_I(inode)->root;
3892 struct btrfs_file_extent_item *item;
3893 struct extent_buffer *leaf;
3894 struct btrfs_key found_key;
3895 struct extent_map *em = NULL;
3896 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3897 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3898 struct btrfs_trans_handle *trans = NULL;
3899 int compressed;
3900
3901again:
3902 spin_lock(&em_tree->lock);
3903 em = lookup_extent_mapping(em_tree, start, len);
3904 if (em)
3905 em->bdev = root->fs_info->fs_devices->latest_bdev;
3906 spin_unlock(&em_tree->lock);
3907
3908 if (em) {
3909 if (em->start > start || em->start + em->len <= start)
3910 free_extent_map(em);
3911 else if (em->block_start == EXTENT_MAP_INLINE && page)
3912 free_extent_map(em);
3913 else
3914 goto out;
3915 }
3916 em = alloc_extent_map(GFP_NOFS);
3917 if (!em) {
3918 err = -ENOMEM;
3919 goto out;
3920 }
3921 em->bdev = root->fs_info->fs_devices->latest_bdev;
3922 em->start = EXTENT_MAP_HOLE;
3923 em->orig_start = EXTENT_MAP_HOLE;
3924 em->len = (u64)-1;
3925 em->block_len = (u64)-1;
3926
3927 if (!path) {
3928 path = btrfs_alloc_path();
3929 BUG_ON(!path);
3930 }
3931
3932 ret = btrfs_lookup_file_extent(trans, root, path,
3933 objectid, start, trans != NULL);
3934 if (ret < 0) {
3935 err = ret;
3936 goto out;
3937 }
3938
3939 if (ret != 0) {
3940 if (path->slots[0] == 0)
3941 goto not_found;
3942 path->slots[0]--;
3943 }
3944
3945 leaf = path->nodes[0];
3946 item = btrfs_item_ptr(leaf, path->slots[0],
3947 struct btrfs_file_extent_item);
3948 /* are we inside the extent that was found? */
3949 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3950 found_type = btrfs_key_type(&found_key);
3951 if (found_key.objectid != objectid ||
3952 found_type != BTRFS_EXTENT_DATA_KEY) {
3953 goto not_found;
3954 }
3955
3956 found_type = btrfs_file_extent_type(leaf, item);
3957 extent_start = found_key.offset;
3958 compressed = btrfs_file_extent_compression(leaf, item);
3959 if (found_type == BTRFS_FILE_EXTENT_REG ||
3960 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3961 extent_end = extent_start +
3962 btrfs_file_extent_num_bytes(leaf, item);
3963 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3964 size_t size;
3965 size = btrfs_file_extent_inline_len(leaf, item);
3966 extent_end = (extent_start + size + root->sectorsize - 1) &
3967 ~((u64)root->sectorsize - 1);
3968 }
3969
3970 if (start >= extent_end) {
3971 path->slots[0]++;
3972 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3973 ret = btrfs_next_leaf(root, path);
3974 if (ret < 0) {
3975 err = ret;
3976 goto out;
3977 }
3978 if (ret > 0)
3979 goto not_found;
3980 leaf = path->nodes[0];
3981 }
3982 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3983 if (found_key.objectid != objectid ||
3984 found_key.type != BTRFS_EXTENT_DATA_KEY)
3985 goto not_found;
3986 if (start + len <= found_key.offset)
3987 goto not_found;
3988 em->start = start;
3989 em->len = found_key.offset - start;
3990 goto not_found_em;
3991 }
3992
3993 if (found_type == BTRFS_FILE_EXTENT_REG ||
3994 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3995 em->start = extent_start;
3996 em->len = extent_end - extent_start;
3997 em->orig_start = extent_start -
3998 btrfs_file_extent_offset(leaf, item);
3999 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4000 if (bytenr == 0) {
4001 em->block_start = EXTENT_MAP_HOLE;
4002 goto insert;
4003 }
4004 if (compressed) {
4005 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4006 em->block_start = bytenr;
4007 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4008 item);
4009 } else {
4010 bytenr += btrfs_file_extent_offset(leaf, item);
4011 em->block_start = bytenr;
4012 em->block_len = em->len;
4013 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4014 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4015 }
4016 goto insert;
4017 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4018 unsigned long ptr;
4019 char *map;
4020 size_t size;
4021 size_t extent_offset;
4022 size_t copy_size;
4023
4024 em->block_start = EXTENT_MAP_INLINE;
4025 if (!page || create) {
4026 em->start = extent_start;
4027 em->len = extent_end - extent_start;
4028 goto out;
4029 }
4030
4031 size = btrfs_file_extent_inline_len(leaf, item);
4032 extent_offset = page_offset(page) + pg_offset - extent_start;
4033 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4034 size - extent_offset);
4035 em->start = extent_start + extent_offset;
4036 em->len = (copy_size + root->sectorsize - 1) &
4037 ~((u64)root->sectorsize - 1);
4038 em->orig_start = EXTENT_MAP_INLINE;
4039 if (compressed)
4040 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4041 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4042 if (create == 0 && !PageUptodate(page)) {
4043 if (btrfs_file_extent_compression(leaf, item) ==
4044 BTRFS_COMPRESS_ZLIB) {
4045 ret = uncompress_inline(path, inode, page,
4046 pg_offset,
4047 extent_offset, item);
4048 BUG_ON(ret);
4049 } else {
4050 map = kmap(page);
4051 read_extent_buffer(leaf, map + pg_offset, ptr,
4052 copy_size);
4053 kunmap(page);
4054 }
4055 flush_dcache_page(page);
4056 } else if (create && PageUptodate(page)) {
4057 if (!trans) {
4058 kunmap(page);
4059 free_extent_map(em);
4060 em = NULL;
4061 btrfs_release_path(root, path);
4062 trans = btrfs_join_transaction(root, 1);
4063 goto again;
4064 }
4065 map = kmap(page);
4066 write_extent_buffer(leaf, map + pg_offset, ptr,
4067 copy_size);
4068 kunmap(page);
4069 btrfs_mark_buffer_dirty(leaf);
4070 }
4071 set_extent_uptodate(io_tree, em->start,
4072 extent_map_end(em) - 1, GFP_NOFS);
4073 goto insert;
4074 } else {
4075 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4076 WARN_ON(1);
4077 }
4078not_found:
4079 em->start = start;
4080 em->len = len;
4081not_found_em:
4082 em->block_start = EXTENT_MAP_HOLE;
4083 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4084insert:
4085 btrfs_release_path(root, path);
4086 if (em->start > start || extent_map_end(em) <= start) {
4087 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4088 "[%llu %llu]\n", (unsigned long long)em->start,
4089 (unsigned long long)em->len,
4090 (unsigned long long)start,
4091 (unsigned long long)len);
4092 err = -EIO;
4093 goto out;
4094 }
4095
4096 err = 0;
4097 spin_lock(&em_tree->lock);
4098 ret = add_extent_mapping(em_tree, em);
4099 /* it is possible that someone inserted the extent into the tree
4100 * while we had the lock dropped. It is also possible that
4101 * an overlapping map exists in the tree
4102 */
4103 if (ret == -EEXIST) {
4104 struct extent_map *existing;
4105
4106 ret = 0;
4107
4108 existing = lookup_extent_mapping(em_tree, start, len);
4109 if (existing && (existing->start > start ||
4110 existing->start + existing->len <= start)) {
4111 free_extent_map(existing);
4112 existing = NULL;
4113 }
4114 if (!existing) {
4115 existing = lookup_extent_mapping(em_tree, em->start,
4116 em->len);
4117 if (existing) {
4118 err = merge_extent_mapping(em_tree, existing,
4119 em, start,
4120 root->sectorsize);
4121 free_extent_map(existing);
4122 if (err) {
4123 free_extent_map(em);
4124 em = NULL;
4125 }
4126 } else {
4127 err = -EIO;
4128 free_extent_map(em);
4129 em = NULL;
4130 }
4131 } else {
4132 free_extent_map(em);
4133 em = existing;
4134 err = 0;
4135 }
4136 }
4137 spin_unlock(&em_tree->lock);
4138out:
4139 if (path)
4140 btrfs_free_path(path);
4141 if (trans) {
4142 ret = btrfs_end_transaction(trans, root);
4143 if (!err)
4144 err = ret;
4145 }
4146 if (err) {
4147 free_extent_map(em);
4148 WARN_ON(1);
4149 return ERR_PTR(err);
4150 }
4151 return em;
4152}
4153
4154static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4155 const struct iovec *iov, loff_t offset,
4156 unsigned long nr_segs)
4157{
4158 return -EINVAL;
4159}
4160
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4162{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent);
4164}
4165
4166int btrfs_readpage(struct file *file, struct page *page)
4167{
4168 struct extent_io_tree *tree;
4169 tree = &BTRFS_I(page->mapping->host)->io_tree;
4170 return extent_read_full_page(tree, page, btrfs_get_extent);
4171}
4172
4173static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4174{
4175 struct extent_io_tree *tree;
4176
4177
4178 if (current->flags & PF_MEMALLOC) {
4179 redirty_page_for_writepage(wbc, page);
4180 unlock_page(page);
4181 return 0;
4182 }
4183 tree = &BTRFS_I(page->mapping->host)->io_tree;
4184 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4185}
4186
4187int btrfs_writepages(struct address_space *mapping,
4188 struct writeback_control *wbc)
4189{
4190 struct extent_io_tree *tree;
4191
4192 tree = &BTRFS_I(mapping->host)->io_tree;
4193 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4194}
4195
4196static int
4197btrfs_readpages(struct file *file, struct address_space *mapping,
4198 struct list_head *pages, unsigned nr_pages)
4199{
4200 struct extent_io_tree *tree;
4201 tree = &BTRFS_I(mapping->host)->io_tree;
4202 return extent_readpages(tree, mapping, pages, nr_pages,
4203 btrfs_get_extent);
4204}
4205static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4206{
4207 struct extent_io_tree *tree;
4208 struct extent_map_tree *map;
4209 int ret;
4210
4211 tree = &BTRFS_I(page->mapping->host)->io_tree;
4212 map = &BTRFS_I(page->mapping->host)->extent_tree;
4213 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4214 if (ret == 1) {
4215 ClearPagePrivate(page);
4216 set_page_private(page, 0);
4217 page_cache_release(page);
4218 }
4219 return ret;
4220}
4221
4222static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{
4224 if (PageWriteback(page) || PageDirty(page))
4225 return 0;
4226 return __btrfs_releasepage(page, gfp_flags);
4227}
4228
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4230{
4231 struct extent_io_tree *tree;
4232 struct btrfs_ordered_extent *ordered;
4233 u64 page_start = page_offset(page);
4234 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4235
4236 wait_on_page_writeback(page);
4237 tree = &BTRFS_I(page->mapping->host)->io_tree;
4238 if (offset) {
4239 btrfs_releasepage(page, GFP_NOFS);
4240 return;
4241 }
4242
4243 lock_extent(tree, page_start, page_end, GFP_NOFS);
4244 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4245 page_offset(page));
4246 if (ordered) {
4247 /*
4248 * IO on this page will never be started, so we need
4249 * to account for any ordered extents now
4250 */
4251 clear_extent_bit(tree, page_start, page_end,
4252 EXTENT_DIRTY | EXTENT_DELALLOC |
4253 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4254 btrfs_finish_ordered_io(page->mapping->host,
4255 page_start, page_end);
4256 btrfs_put_ordered_extent(ordered);
4257 lock_extent(tree, page_start, page_end, GFP_NOFS);
4258 }
4259 clear_extent_bit(tree, page_start, page_end,
4260 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4261 EXTENT_ORDERED,
4262 1, 1, GFP_NOFS);
4263 __btrfs_releasepage(page, GFP_NOFS);
4264
4265 ClearPageChecked(page);
4266 if (PagePrivate(page)) {
4267 ClearPagePrivate(page);
4268 set_page_private(page, 0);
4269 page_cache_release(page);
4270 }
4271}
4272
4273/*
4274 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4275 * called from a page fault handler when a page is first dirtied. Hence we must
4276 * be careful to check for EOF conditions here. We set the page up correctly
4277 * for a written page which means we get ENOSPC checking when writing into
4278 * holes and correct delalloc and unwritten extent mapping on filesystems that
4279 * support these features.
4280 *
4281 * We are not allowed to take the i_mutex here so we have to play games to
4282 * protect against truncate races as the page could now be beyond EOF. Because
4283 * vmtruncate() writes the inode size before removing pages, once we have the
4284 * page lock we can determine safely if the page is beyond EOF. If it is not
4285 * beyond EOF, then the page is guaranteed safe against truncation until we
4286 * unlock the page.
4287 */
4288int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4289{
4290 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4291 struct btrfs_root *root = BTRFS_I(inode)->root;
4292 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4293 struct btrfs_ordered_extent *ordered;
4294 char *kaddr;
4295 unsigned long zero_start;
4296 loff_t size;
4297 int ret;
4298 u64 page_start;
4299 u64 page_end;
4300
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4302 if (ret)
4303 goto out;
4304
4305 ret = -EINVAL;
4306again:
4307 lock_page(page);
4308 size = i_size_read(inode);
4309 page_start = page_offset(page);
4310 page_end = page_start + PAGE_CACHE_SIZE - 1;
4311
4312 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) {
4314 /* page got truncated out from underneath us */
4315 goto out_unlock;
4316 }
4317 wait_on_page_writeback(page);
4318
4319 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4320 set_page_extent_mapped(page);
4321
4322 /*
4323 * we can't set the delalloc bits if there are pending ordered
4324 * extents. Drop our locks and wait for them to finish
4325 */
4326 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4327 if (ordered) {
4328 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4329 unlock_page(page);
4330 btrfs_start_ordered_extent(inode, ordered, 1);
4331 btrfs_put_ordered_extent(ordered);
4332 goto again;
4333 }
4334
4335 btrfs_set_extent_delalloc(inode, page_start, page_end);
4336 ret = 0;
4337
4338 /* page is wholly or partially inside EOF */
4339 if (page_start + PAGE_CACHE_SIZE > size)
4340 zero_start = size & ~PAGE_CACHE_MASK;
4341 else
4342 zero_start = PAGE_CACHE_SIZE;
4343
4344 if (zero_start != PAGE_CACHE_SIZE) {
4345 kaddr = kmap(page);
4346 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4347 flush_dcache_page(page);
4348 kunmap(page);
4349 }
4350 ClearPageChecked(page);
4351 set_page_dirty(page);
4352 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4353
4354out_unlock:
4355 unlock_page(page);
4356out:
4357 return ret;
4358}
4359
4360static void btrfs_truncate(struct inode *inode)
4361{
4362 struct btrfs_root *root = BTRFS_I(inode)->root;
4363 int ret;
4364 struct btrfs_trans_handle *trans;
4365 unsigned long nr;
4366 u64 mask = root->sectorsize - 1;
4367
4368 if (!S_ISREG(inode->i_mode))
4369 return;
4370 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4371 return;
4372
4373 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4374 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4375
4376 trans = btrfs_start_transaction(root, 1);
4377 btrfs_set_trans_block_group(trans, inode);
4378 btrfs_i_size_write(inode, inode->i_size);
4379
4380 ret = btrfs_orphan_add(trans, inode);
4381 if (ret)
4382 goto out;
4383 /* FIXME, add redo link to tree so we don't leak on crash */
4384 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4385 BTRFS_EXTENT_DATA_KEY);
4386 btrfs_update_inode(trans, root, inode);
4387
4388 ret = btrfs_orphan_del(trans, inode);
4389 BUG_ON(ret);
4390
4391out:
4392 nr = trans->blocks_used;
4393 ret = btrfs_end_transaction_throttle(trans, root);
4394 BUG_ON(ret);
4395 btrfs_btree_balance_dirty(root, nr);
4396}
4397
4398/*
4399 * create a new subvolume directory/inode (helper for the ioctl).
4400 */
4401int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4402 struct btrfs_root *new_root, struct dentry *dentry,
4403 u64 new_dirid, u64 alloc_hint)
4404{
4405 struct inode *inode;
4406 int error;
4407 u64 index = 0;
4408
4409 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4410 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4411 if (IS_ERR(inode))
4412 return PTR_ERR(inode);
4413 inode->i_op = &btrfs_dir_inode_operations;
4414 inode->i_fop = &btrfs_dir_file_operations;
4415
4416 inode->i_nlink = 1;
4417 btrfs_i_size_write(inode, 0);
4418
4419 error = btrfs_update_inode(trans, new_root, inode);
4420 if (error)
4421 return error;
4422
4423 d_instantiate(dentry, inode);
4424 return 0;
4425}
4426
4427/* helper function for file defrag and space balancing. This
4428 * forces readahead on a given range of bytes in an inode
4429 */
4430unsigned long btrfs_force_ra(struct address_space *mapping,
4431 struct file_ra_state *ra, struct file *file,
4432 pgoff_t offset, pgoff_t last_index)
4433{
4434 pgoff_t req_size = last_index - offset + 1;
4435
4436 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4437 return offset + req_size;
4438}
4439
4440struct inode *btrfs_alloc_inode(struct super_block *sb)
4441{
4442 struct btrfs_inode *ei;
4443
4444 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4445 if (!ei)
4446 return NULL;
4447 ei->last_trans = 0;
4448 ei->logged_trans = 0;
4449 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4450 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4451 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4452 INIT_LIST_HEAD(&ei->i_orphan);
4453 return &ei->vfs_inode;
4454}
4455
4456void btrfs_destroy_inode(struct inode *inode)
4457{
4458 struct btrfs_ordered_extent *ordered;
4459 WARN_ON(!list_empty(&inode->i_dentry));
4460 WARN_ON(inode->i_data.nrpages);
4461
4462 if (BTRFS_I(inode)->i_acl &&
4463 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4464 posix_acl_release(BTRFS_I(inode)->i_acl);
4465 if (BTRFS_I(inode)->i_default_acl &&
4466 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4467 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4468
4469 spin_lock(&BTRFS_I(inode)->root->list_lock);
4470 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4471 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4472 " list\n", inode->i_ino);
4473 dump_stack();
4474 }
4475 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4476
4477 while (1) {
4478 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4479 if (!ordered)
4480 break;
4481 else {
4482 printk(KERN_ERR "btrfs found ordered "
4483 "extent %llu %llu on inode cleanup\n",
4484 (unsigned long long)ordered->file_offset,
4485 (unsigned long long)ordered->len);
4486 btrfs_remove_ordered_extent(inode, ordered);
4487 btrfs_put_ordered_extent(ordered);
4488 btrfs_put_ordered_extent(ordered);
4489 }
4490 }
4491 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4492 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4493}
4494
4495static void init_once(void *foo)
4496{
4497 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4498
4499 inode_init_once(&ei->vfs_inode);
4500}
4501
4502void btrfs_destroy_cachep(void)
4503{
4504 if (btrfs_inode_cachep)
4505 kmem_cache_destroy(btrfs_inode_cachep);
4506 if (btrfs_trans_handle_cachep)
4507 kmem_cache_destroy(btrfs_trans_handle_cachep);
4508 if (btrfs_transaction_cachep)
4509 kmem_cache_destroy(btrfs_transaction_cachep);
4510 if (btrfs_bit_radix_cachep)
4511 kmem_cache_destroy(btrfs_bit_radix_cachep);
4512 if (btrfs_path_cachep)
4513 kmem_cache_destroy(btrfs_path_cachep);
4514}
4515
4516struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4517 unsigned long extra_flags,
4518 void (*ctor)(void *))
4519{
4520 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4521 SLAB_MEM_SPREAD | extra_flags), ctor);
4522}
4523
4524int btrfs_init_cachep(void)
4525{
4526 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4527 sizeof(struct btrfs_inode),
4528 0, init_once);
4529 if (!btrfs_inode_cachep)
4530 goto fail;
4531 btrfs_trans_handle_cachep =
4532 btrfs_cache_create("btrfs_trans_handle_cache",
4533 sizeof(struct btrfs_trans_handle),
4534 0, NULL);
4535 if (!btrfs_trans_handle_cachep)
4536 goto fail;
4537 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4538 sizeof(struct btrfs_transaction),
4539 0, NULL);
4540 if (!btrfs_transaction_cachep)
4541 goto fail;
4542 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4543 sizeof(struct btrfs_path),
4544 0, NULL);
4545 if (!btrfs_path_cachep)
4546 goto fail;
4547 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4548 SLAB_DESTROY_BY_RCU, NULL);
4549 if (!btrfs_bit_radix_cachep)
4550 goto fail;
4551 return 0;
4552fail:
4553 btrfs_destroy_cachep();
4554 return -ENOMEM;
4555}
4556
4557static int btrfs_getattr(struct vfsmount *mnt,
4558 struct dentry *dentry, struct kstat *stat)
4559{
4560 struct inode *inode = dentry->d_inode;
4561 generic_fillattr(inode, stat);
4562 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4563 stat->blksize = PAGE_CACHE_SIZE;
4564 stat->blocks = (inode_get_bytes(inode) +
4565 BTRFS_I(inode)->delalloc_bytes) >> 9;
4566 return 0;
4567}
4568
4569static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4570 struct inode *new_dir, struct dentry *new_dentry)
4571{
4572 struct btrfs_trans_handle *trans;
4573 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4574 struct inode *new_inode = new_dentry->d_inode;
4575 struct inode *old_inode = old_dentry->d_inode;
4576 struct timespec ctime = CURRENT_TIME;
4577 u64 index = 0;
4578 int ret;
4579
4580 /* we're not allowed to rename between subvolumes */
4581 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4582 BTRFS_I(new_dir)->root->root_key.objectid)
4583 return -EXDEV;
4584
4585 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4586 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4587 return -ENOTEMPTY;
4588 }
4589
4590 /* to rename a snapshot or subvolume, we need to juggle the
4591 * backrefs. This isn't coded yet
4592 */
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV;
4595
4596 ret = btrfs_check_free_space(root, 1, 0);
4597 if (ret)
4598 goto out_unlock;
4599
4600 trans = btrfs_start_transaction(root, 1);
4601
4602 btrfs_set_trans_block_group(trans, new_dir);
4603
4604 btrfs_inc_nlink(old_dentry->d_inode);
4605 old_dir->i_ctime = old_dir->i_mtime = ctime;
4606 new_dir->i_ctime = new_dir->i_mtime = ctime;
4607 old_inode->i_ctime = ctime;
4608
4609 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4610 old_dentry->d_name.name,
4611 old_dentry->d_name.len);
4612 if (ret)
4613 goto out_fail;
4614
4615 if (new_inode) {
4616 new_inode->i_ctime = CURRENT_TIME;
4617 ret = btrfs_unlink_inode(trans, root, new_dir,
4618 new_dentry->d_inode,
4619 new_dentry->d_name.name,
4620 new_dentry->d_name.len);
4621 if (ret)
4622 goto out_fail;
4623 if (new_inode->i_nlink == 0) {
4624 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4625 if (ret)
4626 goto out_fail;
4627 }
4628
4629 }
4630 ret = btrfs_set_inode_index(new_dir, &index);
4631 if (ret)
4632 goto out_fail;
4633
4634 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4635 old_inode, new_dentry->d_name.name,
4636 new_dentry->d_name.len, 1, index);
4637 if (ret)
4638 goto out_fail;
4639
4640out_fail:
4641 btrfs_end_transaction_throttle(trans, root);
4642out_unlock:
4643 return ret;
4644}
4645
4646/*
4647 * some fairly slow code that needs optimization. This walks the list
4648 * of all the inodes with pending delalloc and forces them to disk.
4649 */
4650int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4651{
4652 struct list_head *head = &root->fs_info->delalloc_inodes;
4653 struct btrfs_inode *binode;
4654 struct inode *inode;
4655
4656 if (root->fs_info->sb->s_flags & MS_RDONLY)
4657 return -EROFS;
4658
4659 spin_lock(&root->fs_info->delalloc_lock);
4660 while (!list_empty(head)) {
4661 binode = list_entry(head->next, struct btrfs_inode,
4662 delalloc_inodes);
4663 inode = igrab(&binode->vfs_inode);
4664 if (!inode)
4665 list_del_init(&binode->delalloc_inodes);
4666 spin_unlock(&root->fs_info->delalloc_lock);
4667 if (inode) {
4668 filemap_flush(inode->i_mapping);
4669 iput(inode);
4670 }
4671 cond_resched();
4672 spin_lock(&root->fs_info->delalloc_lock);
4673 }
4674 spin_unlock(&root->fs_info->delalloc_lock);
4675
4676 /* the filemap_flush will queue IO into the worker threads, but
4677 * we have to make sure the IO is actually started and that
4678 * ordered extents get created before we return
4679 */
4680 atomic_inc(&root->fs_info->async_submit_draining);
4681 while (atomic_read(&root->fs_info->nr_async_submits) ||
4682 atomic_read(&root->fs_info->async_delalloc_pages)) {
4683 wait_event(root->fs_info->async_submit_wait,
4684 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4685 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4686 }
4687 atomic_dec(&root->fs_info->async_submit_draining);
4688 return 0;
4689}
4690
4691static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4692 const char *symname)
4693{
4694 struct btrfs_trans_handle *trans;
4695 struct btrfs_root *root = BTRFS_I(dir)->root;
4696 struct btrfs_path *path;
4697 struct btrfs_key key;
4698 struct inode *inode = NULL;
4699 int err;
4700 int drop_inode = 0;
4701 u64 objectid;
4702 u64 index = 0 ;
4703 int name_len;
4704 int datasize;
4705 unsigned long ptr;
4706 struct btrfs_file_extent_item *ei;
4707 struct extent_buffer *leaf;
4708 unsigned long nr = 0;
4709
4710 name_len = strlen(symname) + 1;
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG;
4713
4714 err = btrfs_check_free_space(root, 1, 0);
4715 if (err)
4716 goto out_fail;
4717
4718 trans = btrfs_start_transaction(root, 1);
4719 btrfs_set_trans_block_group(trans, dir);
4720
4721 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4722 if (err) {
4723 err = -ENOSPC;
4724 goto out_unlock;
4725 }
4726
4727 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4728 dentry->d_name.len,
4729 dentry->d_parent->d_inode->i_ino, objectid,
4730 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4731 &index);
4732 err = PTR_ERR(inode);
4733 if (IS_ERR(inode))
4734 goto out_unlock;
4735
4736 err = btrfs_init_acl(inode, dir);
4737 if (err) {
4738 drop_inode = 1;
4739 goto out_unlock;
4740 }
4741
4742 btrfs_set_trans_block_group(trans, inode);
4743 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4744 if (err)
4745 drop_inode = 1;
4746 else {
4747 inode->i_mapping->a_ops = &btrfs_aops;
4748 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4749 inode->i_fop = &btrfs_file_operations;
4750 inode->i_op = &btrfs_file_inode_operations;
4751 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4752 }
4753 dir->i_sb->s_dirt = 1;
4754 btrfs_update_inode_block_group(trans, inode);
4755 btrfs_update_inode_block_group(trans, dir);
4756 if (drop_inode)
4757 goto out_unlock;
4758
4759 path = btrfs_alloc_path();
4760 BUG_ON(!path);
4761 key.objectid = inode->i_ino;
4762 key.offset = 0;
4763 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4764 datasize = btrfs_file_extent_calc_inline_size(name_len);
4765 err = btrfs_insert_empty_item(trans, root, path, &key,
4766 datasize);
4767 if (err) {
4768 drop_inode = 1;
4769 goto out_unlock;
4770 }
4771 leaf = path->nodes[0];
4772 ei = btrfs_item_ptr(leaf, path->slots[0],
4773 struct btrfs_file_extent_item);
4774 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4775 btrfs_set_file_extent_type(leaf, ei,
4776 BTRFS_FILE_EXTENT_INLINE);
4777 btrfs_set_file_extent_encryption(leaf, ei, 0);
4778 btrfs_set_file_extent_compression(leaf, ei, 0);
4779 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4780 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4781
4782 ptr = btrfs_file_extent_inline_start(ei);
4783 write_extent_buffer(leaf, symname, ptr, name_len);
4784 btrfs_mark_buffer_dirty(leaf);
4785 btrfs_free_path(path);
4786
4787 inode->i_op = &btrfs_symlink_inode_operations;
4788 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4789 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4790 inode_set_bytes(inode, name_len);
4791 btrfs_i_size_write(inode, name_len - 1);
4792 err = btrfs_update_inode(trans, root, inode);
4793 if (err)
4794 drop_inode = 1;
4795
4796out_unlock:
4797 nr = trans->blocks_used;
4798 btrfs_end_transaction_throttle(trans, root);
4799out_fail:
4800 if (drop_inode) {
4801 inode_dec_link_count(inode);
4802 iput(inode);
4803 }
4804 btrfs_btree_balance_dirty(root, nr);
4805 return err;
4806}
4807
4808static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4809 u64 alloc_hint, int mode)
4810{
4811 struct btrfs_trans_handle *trans;
4812 struct btrfs_root *root = BTRFS_I(inode)->root;
4813 struct btrfs_key ins;
4814 u64 alloc_size;
4815 u64 cur_offset = start;
4816 u64 num_bytes = end - start;
4817 int ret = 0;
4818
4819 trans = btrfs_join_transaction(root, 1);
4820 BUG_ON(!trans);
4821 btrfs_set_trans_block_group(trans, inode);
4822
4823 while (num_bytes > 0) {
4824 alloc_size = min(num_bytes, root->fs_info->max_extent);
4825 ret = btrfs_reserve_extent(trans, root, alloc_size,
4826 root->sectorsize, 0, alloc_hint,
4827 (u64)-1, &ins, 1);
4828 if (ret) {
4829 WARN_ON(1);
4830 goto out;
4831 }
4832 ret = insert_reserved_file_extent(trans, inode,
4833 cur_offset, ins.objectid,
4834 ins.offset, ins.offset,
4835 ins.offset, 0, 0, 0,
4836 BTRFS_FILE_EXTENT_PREALLOC);
4837 BUG_ON(ret);
4838 num_bytes -= ins.offset;
4839 cur_offset += ins.offset;
4840 alloc_hint = ins.objectid + ins.offset;
4841 }
4842out:
4843 if (cur_offset > start) {
4844 inode->i_ctime = CURRENT_TIME;
4845 btrfs_set_flag(inode, PREALLOC);
4846 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4847 cur_offset > i_size_read(inode))
4848 btrfs_i_size_write(inode, cur_offset);
4849 ret = btrfs_update_inode(trans, root, inode);
4850 BUG_ON(ret);
4851 }
4852
4853 btrfs_end_transaction(trans, root);
4854 return ret;
4855}
4856
4857static long btrfs_fallocate(struct inode *inode, int mode,
4858 loff_t offset, loff_t len)
4859{
4860 u64 cur_offset;
4861 u64 last_byte;
4862 u64 alloc_start;
4863 u64 alloc_end;
4864 u64 alloc_hint = 0;
4865 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4866 struct extent_map *em;
4867 int ret;
4868
4869 alloc_start = offset & ~mask;
4870 alloc_end = (offset + len + mask) & ~mask;
4871
4872 mutex_lock(&inode->i_mutex);
4873 if (alloc_start > inode->i_size) {
4874 ret = btrfs_cont_expand(inode, alloc_start);
4875 if (ret)
4876 goto out;
4877 }
4878
4879 while (1) {
4880 struct btrfs_ordered_extent *ordered;
4881 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4882 alloc_end - 1, GFP_NOFS);
4883 ordered = btrfs_lookup_first_ordered_extent(inode,
4884 alloc_end - 1);
4885 if (ordered &&
4886 ordered->file_offset + ordered->len > alloc_start &&
4887 ordered->file_offset < alloc_end) {
4888 btrfs_put_ordered_extent(ordered);
4889 unlock_extent(&BTRFS_I(inode)->io_tree,
4890 alloc_start, alloc_end - 1, GFP_NOFS);
4891 btrfs_wait_ordered_range(inode, alloc_start,
4892 alloc_end - alloc_start);
4893 } else {
4894 if (ordered)
4895 btrfs_put_ordered_extent(ordered);
4896 break;
4897 }
4898 }
4899
4900 cur_offset = alloc_start;
4901 while (1) {
4902 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4903 alloc_end - cur_offset, 0);
4904 BUG_ON(IS_ERR(em) || !em);
4905 last_byte = min(extent_map_end(em), alloc_end);
4906 last_byte = (last_byte + mask) & ~mask;
4907 if (em->block_start == EXTENT_MAP_HOLE) {
4908 ret = prealloc_file_range(inode, cur_offset,
4909 last_byte, alloc_hint, mode);
4910 if (ret < 0) {
4911 free_extent_map(em);
4912 break;
4913 }
4914 }
4915 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4916 alloc_hint = em->block_start;
4917 free_extent_map(em);
4918
4919 cur_offset = last_byte;
4920 if (cur_offset >= alloc_end) {
4921 ret = 0;
4922 break;
4923 }
4924 }
4925 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4926 GFP_NOFS);
4927out:
4928 mutex_unlock(&inode->i_mutex);
4929 return ret;
4930}
4931
4932static int btrfs_set_page_dirty(struct page *page)
4933{
4934 return __set_page_dirty_nobuffers(page);
4935}
4936
4937static int btrfs_permission(struct inode *inode, int mask)
4938{
4939 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4940 return -EACCES;
4941 return generic_permission(inode, mask, btrfs_check_acl);
4942}
4943
4944static struct inode_operations btrfs_dir_inode_operations = {
4945 .getattr = btrfs_getattr,
4946 .lookup = btrfs_lookup,
4947 .create = btrfs_create,
4948 .unlink = btrfs_unlink,
4949 .link = btrfs_link,
4950 .mkdir = btrfs_mkdir,
4951 .rmdir = btrfs_rmdir,
4952 .rename = btrfs_rename,
4953 .symlink = btrfs_symlink,
4954 .setattr = btrfs_setattr,
4955 .mknod = btrfs_mknod,
4956 .setxattr = btrfs_setxattr,
4957 .getxattr = btrfs_getxattr,
4958 .listxattr = btrfs_listxattr,
4959 .removexattr = btrfs_removexattr,
4960 .permission = btrfs_permission,
4961};
4962static struct inode_operations btrfs_dir_ro_inode_operations = {
4963 .lookup = btrfs_lookup,
4964 .permission = btrfs_permission,
4965};
4966static struct file_operations btrfs_dir_file_operations = {
4967 .llseek = generic_file_llseek,
4968 .read = generic_read_dir,
4969 .readdir = btrfs_real_readdir,
4970 .unlocked_ioctl = btrfs_ioctl,
4971#ifdef CONFIG_COMPAT
4972 .compat_ioctl = btrfs_ioctl,
4973#endif
4974 .release = btrfs_release_file,
4975 .fsync = btrfs_sync_file,
4976};
4977
4978static struct extent_io_ops btrfs_extent_io_ops = {
4979 .fill_delalloc = run_delalloc_range,
4980 .submit_bio_hook = btrfs_submit_bio_hook,
4981 .merge_bio_hook = btrfs_merge_bio_hook,
4982 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4983 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4984 .writepage_start_hook = btrfs_writepage_start_hook,
4985 .readpage_io_failed_hook = btrfs_io_failed_hook,
4986 .set_bit_hook = btrfs_set_bit_hook,
4987 .clear_bit_hook = btrfs_clear_bit_hook,
4988};
4989
4990static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage,
5000 .set_page_dirty = btrfs_set_page_dirty,
5001};
5002
5003static struct address_space_operations btrfs_symlink_aops = {
5004 .readpage = btrfs_readpage,
5005 .writepage = btrfs_writepage,
5006 .invalidatepage = btrfs_invalidatepage,
5007 .releasepage = btrfs_releasepage,
5008};
5009
5010static struct inode_operations btrfs_file_inode_operations = {
5011 .truncate = btrfs_truncate,
5012 .getattr = btrfs_getattr,
5013 .setattr = btrfs_setattr,
5014 .setxattr = btrfs_setxattr,
5015 .getxattr = btrfs_getxattr,
5016 .listxattr = btrfs_listxattr,
5017 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate,
5020};
5021static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr,
5023 .setattr = btrfs_setattr,
5024 .permission = btrfs_permission,
5025 .setxattr = btrfs_setxattr,
5026 .getxattr = btrfs_getxattr,
5027 .listxattr = btrfs_listxattr,
5028 .removexattr = btrfs_removexattr,
5029};
5030static struct inode_operations btrfs_symlink_inode_operations = {
5031 .readlink = generic_readlink,
5032 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link,
5034 .permission = btrfs_permission,
5035};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "compat.h"
45#include "ctree.h"
46#include "disk-io.h"
47#include "transaction.h"
48#include "btrfs_inode.h"
49#include "ioctl.h"
50#include "print-tree.h"
51#include "volumes.h"
52#include "locking.h"
53
54
55
56static noinline int create_subvol(struct btrfs_root *root,
57 struct dentry *dentry,
58 char *name, int namelen)
59{
60 struct btrfs_trans_handle *trans;
61 struct btrfs_key key;
62 struct btrfs_root_item root_item;
63 struct btrfs_inode_item *inode_item;
64 struct extent_buffer *leaf;
65 struct btrfs_root *new_root = root;
66 struct inode *dir;
67 int ret;
68 int err;
69 u64 objectid;
70 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
71 u64 index = 0;
72 unsigned long nr = 1;
73
74 ret = btrfs_check_free_space(root, 1, 0);
75 if (ret)
76 goto fail_commit;
77
78 trans = btrfs_start_transaction(root, 1);
79 BUG_ON(!trans);
80
81 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
82 0, &objectid);
83 if (ret)
84 goto fail;
85
86 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
87 objectid, trans->transid, 0, 0, 0);
88 if (IS_ERR(leaf)) {
89 ret = PTR_ERR(leaf);
90 goto fail;
91 }
92
93 btrfs_set_header_nritems(leaf, 0);
94 btrfs_set_header_level(leaf, 0);
95 btrfs_set_header_bytenr(leaf, leaf->start);
96 btrfs_set_header_generation(leaf, trans->transid);
97 btrfs_set_header_owner(leaf, objectid);
98
99 write_extent_buffer(leaf, root->fs_info->fsid,
100 (unsigned long)btrfs_header_fsid(leaf),
101 BTRFS_FSID_SIZE);
102 btrfs_mark_buffer_dirty(leaf);
103
104 inode_item = &root_item.inode;
105 memset(inode_item, 0, sizeof(*inode_item));
106 inode_item->generation = cpu_to_le64(1);
107 inode_item->size = cpu_to_le64(3);
108 inode_item->nlink = cpu_to_le32(1);
109 inode_item->nbytes = cpu_to_le64(root->leafsize);
110 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
111
112 btrfs_set_root_bytenr(&root_item, leaf->start);
113 btrfs_set_root_generation(&root_item, trans->transid);
114 btrfs_set_root_level(&root_item, 0);
115 btrfs_set_root_refs(&root_item, 1);
116 btrfs_set_root_used(&root_item, 0);
117 btrfs_set_root_last_snapshot(&root_item, 0);
118
119 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
120 root_item.drop_level = 0;
121
122 btrfs_tree_unlock(leaf);
123 free_extent_buffer(leaf);
124 leaf = NULL;
125
126 btrfs_set_root_dirid(&root_item, new_dirid);
127
128 key.objectid = objectid;
129 key.offset = 1;
130 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
131 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
132 &root_item);
133 if (ret)
134 goto fail;
135
136 /*
137 * insert the directory item
138 */
139 key.offset = (u64)-1;
140 dir = dentry->d_parent->d_inode;
141 ret = btrfs_set_inode_index(dir, &index);
142 BUG_ON(ret);
143
144 ret = btrfs_insert_dir_item(trans, root,
145 name, namelen, dir->i_ino, &key,
146 BTRFS_FT_DIR, index);
147 if (ret)
148 goto fail;
149
150 btrfs_i_size_write(dir, dir->i_size + namelen * 2);
151 ret = btrfs_update_inode(trans, root, dir);
152 BUG_ON(ret);
153
154 /* add the backref first */
155 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
156 objectid, BTRFS_ROOT_BACKREF_KEY,
157 root->root_key.objectid,
158 dir->i_ino, index, name, namelen);
159
160 BUG_ON(ret);
161
162 /* now add the forward ref */
163 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
164 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
165 objectid,
166 dir->i_ino, index, name, namelen);
167
168 BUG_ON(ret);
169
170 ret = btrfs_commit_transaction(trans, root);
171 if (ret)
172 goto fail_commit;
173
174 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
175 BUG_ON(!new_root);
176
177 trans = btrfs_start_transaction(new_root, 1);
178 BUG_ON(!trans);
179
180 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
181 BTRFS_I(dir)->block_group);
182 if (ret)
183 goto fail;
184
185fail:
186 nr = trans->blocks_used;
187 err = btrfs_commit_transaction(trans, new_root);
188 if (err && !ret)
189 ret = err;
190fail_commit:
191 btrfs_btree_balance_dirty(root, nr);
192 return ret;
193}
194
195static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
196 char *name, int namelen)
197{
198 struct btrfs_pending_snapshot *pending_snapshot;
199 struct btrfs_trans_handle *trans;
200 int ret = 0;
201 int err;
202 unsigned long nr = 0;
203
204 if (!root->ref_cows)
205 return -EINVAL;
206
207 ret = btrfs_check_free_space(root, 1, 0);
208 if (ret)
209 goto fail_unlock;
210
211 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
212 if (!pending_snapshot) {
213 ret = -ENOMEM;
214 goto fail_unlock;
215 }
216 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
217 if (!pending_snapshot->name) {
218 ret = -ENOMEM;
219 kfree(pending_snapshot);
220 goto fail_unlock;
221 }
222 memcpy(pending_snapshot->name, name, namelen);
223 pending_snapshot->name[namelen] = '\0';
224 pending_snapshot->dentry = dentry;
225 trans = btrfs_start_transaction(root, 1);
226 BUG_ON(!trans);
227 pending_snapshot->root = root;
228 list_add(&pending_snapshot->list,
229 &trans->transaction->pending_snapshots);
230 err = btrfs_commit_transaction(trans, root);
231
232fail_unlock:
233 btrfs_btree_balance_dirty(root, nr);
234 return ret;
235}
236
237/* copy of may_create in fs/namei.c() */
238static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
239{
240 if (child->d_inode)
241 return -EEXIST;
242 if (IS_DEADDIR(dir))
243 return -ENOENT;
244 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
245}
246
247/*
248 * Create a new subvolume below @parent. This is largely modeled after
249 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
250 * inside this filesystem so it's quite a bit simpler.
251 */
252static noinline int btrfs_mksubvol(struct path *parent, char *name,
253 int mode, int namelen,
254 struct btrfs_root *snap_src)
255{
256 struct dentry *dentry;
257 int error;
258
259 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
260
261 dentry = lookup_one_len(name, parent->dentry, namelen);
262 error = PTR_ERR(dentry);
263 if (IS_ERR(dentry))
264 goto out_unlock;
265
266 error = -EEXIST;
267 if (dentry->d_inode)
268 goto out_dput;
269
270 if (!IS_POSIXACL(parent->dentry->d_inode))
271 mode &= ~current->fs->umask;
272
273 error = mnt_want_write(parent->mnt);
274 if (error)
275 goto out_dput;
276
277 error = btrfs_may_create(parent->dentry->d_inode, dentry);
278 if (error)
279 goto out_drop_write;
280
281 /*
282 * Actually perform the low-level subvolume creation after all
283 * this VFS fuzz.
284 *
285 * Eventually we want to pass in an inode under which we create this
286 * subvolume, but for now all are under the filesystem root.
287 *
288 * Also we should pass on the mode eventually to allow creating new
289 * subvolume with specific mode bits.
290 */
291 if (snap_src) {
292 struct dentry *dir = dentry->d_parent;
293 struct dentry *test = dir->d_parent;
294 struct btrfs_path *path = btrfs_alloc_path();
295 int ret;
296 u64 test_oid;
297 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
298
299 test_oid = snap_src->root_key.objectid;
300
301 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
302 path, parent_oid, test_oid);
303 if (ret == 0)
304 goto create;
305 btrfs_release_path(snap_src->fs_info->tree_root, path);
306
307 /* we need to make sure we aren't creating a directory loop
308 * by taking a snapshot of something that has our current
309 * subvol in its directory tree. So, this loops through
310 * the dentries and checks the forward refs for each subvolume
311 * to see if is references the subvolume where we are
312 * placing this new snapshot.
313 */
314 while (1) {
315 if (!test ||
316 dir == snap_src->fs_info->sb->s_root ||
317 test == snap_src->fs_info->sb->s_root ||
318 test->d_inode->i_sb != snap_src->fs_info->sb) {
319 break;
320 }
321 if (S_ISLNK(test->d_inode->i_mode)) {
322 printk(KERN_INFO "Btrfs symlink in snapshot "
323 "path, failed\n");
324 error = -EMLINK;
325 btrfs_free_path(path);
326 goto out_drop_write;
327 }
328 test_oid =
329 BTRFS_I(test->d_inode)->root->root_key.objectid;
330 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
331 path, test_oid, parent_oid);
332 if (ret == 0) {
333 printk(KERN_INFO "Btrfs snapshot creation "
334 "failed, looping\n");
335 error = -EMLINK;
336 btrfs_free_path(path);
337 goto out_drop_write;
338 }
339 btrfs_release_path(snap_src->fs_info->tree_root, path);
340 test = test->d_parent;
341 }
342create:
343 btrfs_free_path(path);
344 error = create_snapshot(snap_src, dentry, name, namelen);
345 } else {
346 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
347 dentry, name, namelen);
348 }
349 if (error)
350 goto out_drop_write;
351
352 fsnotify_mkdir(parent->dentry->d_inode, dentry);
353out_drop_write:
354 mnt_drop_write(parent->mnt);
355out_dput:
356 dput(dentry);
357out_unlock:
358 mutex_unlock(&parent->dentry->d_inode->i_mutex);
359 return error;
360}
361
362
363static int btrfs_defrag_file(struct file *file)
364{
365 struct inode *inode = fdentry(file)->d_inode;
366 struct btrfs_root *root = BTRFS_I(inode)->root;
367 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
368 struct btrfs_ordered_extent *ordered;
369 struct page *page;
370 unsigned long last_index;
371 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
372 unsigned long total_read = 0;
373 u64 page_start;
374 u64 page_end;
375 unsigned long i;
376 int ret;
377
378 ret = btrfs_check_free_space(root, inode->i_size, 0);
379 if (ret)
380 return -ENOSPC;
381
382 mutex_lock(&inode->i_mutex);
383 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
384 for (i = 0; i <= last_index; i++) {
385 if (total_read % ra_pages == 0) {
386 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
387 min(last_index, i + ra_pages - 1));
388 }
389 total_read++;
390again:
391 page = grab_cache_page(inode->i_mapping, i);
392 if (!page)
393 goto out_unlock;
394 if (!PageUptodate(page)) {
395 btrfs_readpage(NULL, page);
396 lock_page(page);
397 if (!PageUptodate(page)) {
398 unlock_page(page);
399 page_cache_release(page);
400 goto out_unlock;
401 }
402 }
403
404 wait_on_page_writeback(page);
405
406 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
407 page_end = page_start + PAGE_CACHE_SIZE - 1;
408 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
409
410 ordered = btrfs_lookup_ordered_extent(inode, page_start);
411 if (ordered) {
412 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
413 unlock_page(page);
414 page_cache_release(page);
415 btrfs_start_ordered_extent(inode, ordered, 1);
416 btrfs_put_ordered_extent(ordered);
417 goto again;
418 }
419 set_page_extent_mapped(page);
420
421 /*
422 * this makes sure page_mkwrite is called on the
423 * page if it is dirtied again later
424 */
425 clear_page_dirty_for_io(page);
426
427 btrfs_set_extent_delalloc(inode, page_start, page_end);
428
429 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
430 set_page_dirty(page);
431 unlock_page(page);
432 page_cache_release(page);
433 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
434 }
435
436out_unlock:
437 mutex_unlock(&inode->i_mutex);
438 return 0;
439}
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444
445static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
446{
447 u64 new_size;
448 u64 old_size;
449 u64 devid = 1;
450 struct btrfs_ioctl_vol_args *vol_args;
451 struct btrfs_trans_handle *trans;
452 struct btrfs_device *device = NULL;
453 char *sizestr;
454 char *devstr = NULL;
455 int ret = 0;
456 int namelen;
457 int mod = 0;
458
459 if (root->fs_info->sb->s_flags & MS_RDONLY)
460 return -EROFS;
461
462 if (!capable(CAP_SYS_ADMIN))
463 return -EPERM;
464
465 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
466
467 if (!vol_args)
468 return -ENOMEM;
469
470 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
476 namelen = strlen(vol_args->name);
477
478 mutex_lock(&root->fs_info->volume_mutex);
479 sizestr = vol_args->name;
480 devstr = strchr(sizestr, ':');
481 if (devstr) {
482 char *end;
483 sizestr = devstr + 1;
484 *devstr = '\0';
485 devstr = vol_args->name;
486 devid = simple_strtoull(devstr, &end, 10);
487 printk(KERN_INFO "resizing devid %llu\n", devid);
488 }
489 device = btrfs_find_device(root, devid, NULL, NULL);
490 if (!device) {
491 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
492 ret = -EINVAL;
493 goto out_unlock;
494 }
495 if (!strcmp(sizestr, "max"))
496 new_size = device->bdev->bd_inode->i_size;
497 else {
498 if (sizestr[0] == '-') {
499 mod = -1;
500 sizestr++;
501 } else if (sizestr[0] == '+') {
502 mod = 1;
503 sizestr++;
504 }
505 new_size = btrfs_parse_size(sizestr);
506 if (new_size == 0) {
507 ret = -EINVAL;
508 goto out_unlock;
509 }
510 }
511
512 old_size = device->total_bytes;
513
514 if (mod < 0) {
515 if (new_size > old_size) {
516 ret = -EINVAL;
517 goto out_unlock;
518 }
519 new_size = old_size - new_size;
520 } else if (mod > 0) {
521 new_size = old_size + new_size;
522 }
523
524 if (new_size < 256 * 1024 * 1024) {
525 ret = -EINVAL;
526 goto out_unlock;
527 }
528 if (new_size > device->bdev->bd_inode->i_size) {
529 ret = -EFBIG;
530 goto out_unlock;
531 }
532
533 do_div(new_size, root->sectorsize);
534 new_size *= root->sectorsize;
535
536 printk(KERN_INFO "new size for %s is %llu\n",
537 device->name, (unsigned long long)new_size);
538
539 if (new_size > old_size) {
540 trans = btrfs_start_transaction(root, 1);
541 ret = btrfs_grow_device(trans, device, new_size);
542 btrfs_commit_transaction(trans, root);
543 } else {
544 ret = btrfs_shrink_device(device, new_size);
545 }
546
547out_unlock:
548 mutex_unlock(&root->fs_info->volume_mutex);
549out:
550 kfree(vol_args);
551 return ret;
552}
553
554static noinline int btrfs_ioctl_snap_create(struct file *file,
555 void __user *arg, int subvol)
556{
557 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
558 struct btrfs_ioctl_vol_args *vol_args;
559 struct btrfs_dir_item *di;
560 struct btrfs_path *path;
561 struct file *src_file;
562 u64 root_dirid;
563 int namelen;
564 int ret = 0;
565
566 if (root->fs_info->sb->s_flags & MS_RDONLY)
567 return -EROFS;
568
569 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
570
571 if (!vol_args)
572 return -ENOMEM;
573
574 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
575 ret = -EFAULT;
576 goto out;
577 }
578
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 if (strchr(vol_args->name, '/')) {
582 ret = -EINVAL;
583 goto out;
584 }
585
586 path = btrfs_alloc_path();
587 if (!path) {
588 ret = -ENOMEM;
589 goto out;
590 }
591
592 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
593 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
594 path, root_dirid,
595 vol_args->name, namelen, 0);
596 btrfs_free_path(path);
597
598 if (di && !IS_ERR(di)) {
599 ret = -EEXIST;
600 goto out;
601 }
602
603 if (IS_ERR(di)) {
604 ret = PTR_ERR(di);
605 goto out;
606 }
607
608 if (subvol) {
609 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
610 file->f_path.dentry->d_inode->i_mode,
611 namelen, NULL);
612 } else {
613 struct inode *src_inode;
614 src_file = fget(vol_args->fd);
615 if (!src_file) {
616 ret = -EINVAL;
617 goto out;
618 }
619
620 src_inode = src_file->f_path.dentry->d_inode;
621 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
622 printk(KERN_INFO "btrfs: Snapshot src from "
623 "another FS\n");
624 ret = -EINVAL;
625 fput(src_file);
626 goto out;
627 }
628 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
629 file->f_path.dentry->d_inode->i_mode,
630 namelen, BTRFS_I(src_inode)->root);
631 fput(src_file);
632 }
633
634out:
635 kfree(vol_args);
636 return ret;
637}
638
639static int btrfs_ioctl_defrag(struct file *file)
640{
641 struct inode *inode = fdentry(file)->d_inode;
642 struct btrfs_root *root = BTRFS_I(inode)->root;
643 int ret;
644
645 ret = mnt_want_write(file->f_path.mnt);
646 if (ret)
647 return ret;
648
649 switch (inode->i_mode & S_IFMT) {
650 case S_IFDIR:
651 if (!capable(CAP_SYS_ADMIN)) {
652 ret = -EPERM;
653 goto out;
654 }
655 btrfs_defrag_root(root, 0);
656 btrfs_defrag_root(root->fs_info->extent_root, 0);
657 break;
658 case S_IFREG:
659 if (!(file->f_mode & FMODE_WRITE)) {
660 ret = -EINVAL;
661 goto out;
662 }
663 btrfs_defrag_file(file);
664 break;
665 }
666out:
667 mnt_drop_write(file->f_path.mnt);
668 return ret;
669}
670
671static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
672{
673 struct btrfs_ioctl_vol_args *vol_args;
674 int ret;
675
676 if (!capable(CAP_SYS_ADMIN))
677 return -EPERM;
678
679 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
680
681 if (!vol_args)
682 return -ENOMEM;
683
684 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
685 ret = -EFAULT;
686 goto out;
687 }
688 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
689 ret = btrfs_init_new_device(root, vol_args->name);
690
691out:
692 kfree(vol_args);
693 return ret;
694}
695
696static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
697{
698 struct btrfs_ioctl_vol_args *vol_args;
699 int ret;
700
701 if (!capable(CAP_SYS_ADMIN))
702 return -EPERM;
703
704 if (root->fs_info->sb->s_flags & MS_RDONLY)
705 return -EROFS;
706
707 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
708
709 if (!vol_args)
710 return -ENOMEM;
711
712 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
713 ret = -EFAULT;
714 goto out;
715 }
716 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
717 ret = btrfs_rm_device(root, vol_args->name);
718
719out:
720 kfree(vol_args);
721 return ret;
722}
723
724static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
725 u64 off, u64 olen, u64 destoff)
726{
727 struct inode *inode = fdentry(file)->d_inode;
728 struct btrfs_root *root = BTRFS_I(inode)->root;
729 struct file *src_file;
730 struct inode *src;
731 struct btrfs_trans_handle *trans;
732 struct btrfs_path *path;
733 struct extent_buffer *leaf;
734 char *buf;
735 struct btrfs_key key;
736 u32 nritems;
737 int slot;
738 int ret;
739 u64 len = olen;
740 u64 bs = root->fs_info->sb->s_blocksize;
741 u64 hint_byte;
742
743 /*
744 * TODO:
745 * - split compressed inline extents. annoying: we need to
746 * decompress into destination's address_space (the file offset
747 * may change, so source mapping won't do), then recompress (or
748 * otherwise reinsert) a subrange.
749 * - allow ranges within the same file to be cloned (provided
750 * they don't overlap)?
751 */
752
753 /* the destination must be opened for writing */
754 if (!(file->f_mode & FMODE_WRITE))
755 return -EINVAL;
756
757 ret = mnt_want_write(file->f_path.mnt);
758 if (ret)
759 return ret;
760
761 src_file = fget(srcfd);
762 if (!src_file) {
763 ret = -EBADF;
764 goto out_drop_write;
765 }
766 src = src_file->f_dentry->d_inode;
767
768 ret = -EINVAL;
769 if (src == inode)
770 goto out_fput;
771
772 ret = -EISDIR;
773 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
774 goto out_fput;
775
776 ret = -EXDEV;
777 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
778 goto out_fput;
779
780 ret = -ENOMEM;
781 buf = vmalloc(btrfs_level_size(root, 0));
782 if (!buf)
783 goto out_fput;
784
785 path = btrfs_alloc_path();
786 if (!path) {
787 vfree(buf);
788 goto out_fput;
789 }
790 path->reada = 2;
791
792 if (inode < src) {
793 mutex_lock(&inode->i_mutex);
794 mutex_lock(&src->i_mutex);
795 } else {
796 mutex_lock(&src->i_mutex);
797 mutex_lock(&inode->i_mutex);
798 }
799
800 /* determine range to clone */
801 ret = -EINVAL;
802 if (off >= src->i_size || off + len > src->i_size)
803 goto out_unlock;
804 if (len == 0)
805 olen = len = src->i_size - off;
806 /* if we extend to eof, continue to block boundary */
807 if (off + len == src->i_size)
808 len = ((src->i_size + bs-1) & ~(bs-1))
809 - off;
810
811 /* verify the end result is block aligned */
812 if ((off & (bs-1)) ||
813 ((off + len) & (bs-1)))
814 goto out_unlock;
815
816 /* do any pending delalloc/csum calc on src, one way or
817 another, and lock file content */
818 while (1) {
819 struct btrfs_ordered_extent *ordered;
820 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
821 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
822 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
823 break;
824 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
825 if (ordered)
826 btrfs_put_ordered_extent(ordered);
827 btrfs_wait_ordered_range(src, off, off+len);
828 }
829
830 trans = btrfs_start_transaction(root, 1);
831 BUG_ON(!trans);
832
833 /* punch hole in destination first */
834 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
835
836 /* clone data */
837 key.objectid = src->i_ino;
838 key.type = BTRFS_EXTENT_DATA_KEY;
839 key.offset = 0;
840
841 while (1) {
842 /*
843 * note the key will change type as we walk through the
844 * tree.
845 */
846 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
847 if (ret < 0)
848 goto out;
849
850 nritems = btrfs_header_nritems(path->nodes[0]);
851 if (path->slots[0] >= nritems) {
852 ret = btrfs_next_leaf(root, path);
853 if (ret < 0)
854 goto out;
855 if (ret > 0)
856 break;
857 nritems = btrfs_header_nritems(path->nodes[0]);
858 }
859 leaf = path->nodes[0];
860 slot = path->slots[0];
861
862 btrfs_item_key_to_cpu(leaf, &key, slot);
863 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
864 key.objectid != src->i_ino)
865 break;
866
867 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
868 struct btrfs_file_extent_item *extent;
869 int type;
870 u32 size;
871 struct btrfs_key new_key;
872 u64 disko = 0, diskl = 0;
873 u64 datao = 0, datal = 0;
874 u8 comp;
875
876 size = btrfs_item_size_nr(leaf, slot);
877 read_extent_buffer(leaf, buf,
878 btrfs_item_ptr_offset(leaf, slot),
879 size);
880
881 extent = btrfs_item_ptr(leaf, slot,
882 struct btrfs_file_extent_item);
883 comp = btrfs_file_extent_compression(leaf, extent);
884 type = btrfs_file_extent_type(leaf, extent);
885 if (type == BTRFS_FILE_EXTENT_REG) {
886 disko = btrfs_file_extent_disk_bytenr(leaf,
887 extent);
888 diskl = btrfs_file_extent_disk_num_bytes(leaf,
889 extent);
890 datao = btrfs_file_extent_offset(leaf, extent);
891 datal = btrfs_file_extent_num_bytes(leaf,
892 extent);
893 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
894 /* take upper bound, may be compressed */
895 datal = btrfs_file_extent_ram_bytes(leaf,
896 extent);
897 }
898 btrfs_release_path(root, path);
899
900 if (key.offset + datal < off ||
901 key.offset >= off+len)
902 goto next;
903
904 memcpy(&new_key, &key, sizeof(new_key));
905 new_key.objectid = inode->i_ino;
906 new_key.offset = key.offset + destoff - off;
907
908 if (type == BTRFS_FILE_EXTENT_REG) {
909 ret = btrfs_insert_empty_item(trans, root, path,
910 &new_key, size);
911 if (ret)
912 goto out;
913
914 leaf = path->nodes[0];
915 slot = path->slots[0];
916 write_extent_buffer(leaf, buf,
917 btrfs_item_ptr_offset(leaf, slot),
918 size);
919
920 extent = btrfs_item_ptr(leaf, slot,
921 struct btrfs_file_extent_item);
922
923 if (off > key.offset) {
924 datao += off - key.offset;
925 datal -= off - key.offset;
926 }
927 if (key.offset + datao + datal + key.offset >
928 off + len)
929 datal = off + len - key.offset - datao;
930 /* disko == 0 means it's a hole */
931 if (!disko)
932 datao = 0;
933
934 btrfs_set_file_extent_offset(leaf, extent,
935 datao);
936 btrfs_set_file_extent_num_bytes(leaf, extent,
937 datal);
938 if (disko) {
939 inode_add_bytes(inode, datal);
940 ret = btrfs_inc_extent_ref(trans, root,
941 disko, diskl, leaf->start,
942 root->root_key.objectid,
943 trans->transid,
944 inode->i_ino);
945 BUG_ON(ret);
946 }
947 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
948 u64 skip = 0;
949 u64 trim = 0;
950 if (off > key.offset) {
951 skip = off - key.offset;
952 new_key.offset += skip;
953 }
954
955 if (key.offset + datal > off+len)
956 trim = key.offset + datal - (off+len);
957
958 if (comp && (skip || trim)) {
959 ret = -EINVAL;
960 goto out;
961 }
962 size -= skip + trim;
963 datal -= skip + trim;
964 ret = btrfs_insert_empty_item(trans, root, path,
965 &new_key, size);
966 if (ret)
967 goto out;
968
969 if (skip) {
970 u32 start =
971 btrfs_file_extent_calc_inline_size(0);
972 memmove(buf+start, buf+start+skip,
973 datal);
974 }
975
976 leaf = path->nodes[0];
977 slot = path->slots[0];
978 write_extent_buffer(leaf, buf,
979 btrfs_item_ptr_offset(leaf, slot),
980 size);
981 inode_add_bytes(inode, datal);
982 }
983
984 btrfs_mark_buffer_dirty(leaf);
985 }
986
987next:
988 btrfs_release_path(root, path);
989 key.offset++;
990 }
991 ret = 0;
992out:
993 btrfs_release_path(root, path);
994 if (ret == 0) {
995 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
996 if (destoff + olen > inode->i_size)
997 btrfs_i_size_write(inode, destoff + olen);
998 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
999 ret = btrfs_update_inode(trans, root, inode);
1000 }
1001 btrfs_end_transaction(trans, root);
1002 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1003 if (ret)
1004 vmtruncate(inode, 0);
1005out_unlock:
1006 mutex_unlock(&src->i_mutex);
1007 mutex_unlock(&inode->i_mutex);
1008 vfree(buf);
1009 btrfs_free_path(path);
1010out_fput:
1011 fput(src_file);
1012out_drop_write:
1013 mnt_drop_write(file->f_path.mnt);
1014 return ret;
1015}
1016
1017static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1018{
1019 struct btrfs_ioctl_clone_range_args args;
1020
1021 if (copy_from_user(&args, argp, sizeof(args)))
1022 return -EFAULT;
1023 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1024 args.src_length, args.dest_offset);
1025}
1026
1027/*
1028 * there are many ways the trans_start and trans_end ioctls can lead
1029 * to deadlocks. They should only be used by applications that
1030 * basically own the machine, and have a very in depth understanding
1031 * of all the possible deadlocks and enospc problems.
1032 */
1033static long btrfs_ioctl_trans_start(struct file *file)
1034{
1035 struct inode *inode = fdentry(file)->d_inode;
1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1037 struct btrfs_trans_handle *trans;
1038 int ret = 0;
1039
1040 if (!capable(CAP_SYS_ADMIN))
1041 return -EPERM;
1042
1043 if (file->private_data) {
1044 ret = -EINPROGRESS;
1045 goto out;
1046 }
1047
1048 ret = mnt_want_write(file->f_path.mnt);
1049 if (ret)
1050 goto out;
1051
1052 mutex_lock(&root->fs_info->trans_mutex);
1053 root->fs_info->open_ioctl_trans++;
1054 mutex_unlock(&root->fs_info->trans_mutex);
1055
1056 trans = btrfs_start_ioctl_transaction(root, 0);
1057 if (trans)
1058 file->private_data = trans;
1059 else
1060 ret = -ENOMEM;
1061 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1062out:
1063 return ret;
1064}
1065
1066/*
1067 * there are many ways the trans_start and trans_end ioctls can lead
1068 * to deadlocks. They should only be used by applications that
1069 * basically own the machine, and have a very in depth understanding
1070 * of all the possible deadlocks and enospc problems.
1071 */
1072long btrfs_ioctl_trans_end(struct file *file)
1073{
1074 struct inode *inode = fdentry(file)->d_inode;
1075 struct btrfs_root *root = BTRFS_I(inode)->root;
1076 struct btrfs_trans_handle *trans;
1077 int ret = 0;
1078
1079 trans = file->private_data;
1080 if (!trans) {
1081 ret = -EINVAL;
1082 goto out;
1083 }
1084 btrfs_end_transaction(trans, root);
1085 file->private_data = NULL;
1086
1087 mutex_lock(&root->fs_info->trans_mutex);
1088 root->fs_info->open_ioctl_trans--;
1089 mutex_unlock(&root->fs_info->trans_mutex);
1090
1091 mnt_drop_write(file->f_path.mnt);
1092
1093out:
1094 return ret;
1095}
1096
1097long btrfs_ioctl(struct file *file, unsigned int
1098 cmd, unsigned long arg)
1099{
1100 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1101 void __user *argp = (void __user *)arg;
1102
1103 switch (cmd) {
1104 case BTRFS_IOC_SNAP_CREATE:
1105 return btrfs_ioctl_snap_create(file, argp, 0);
1106 case BTRFS_IOC_SUBVOL_CREATE:
1107 return btrfs_ioctl_snap_create(file, argp, 1);
1108 case BTRFS_IOC_DEFRAG:
1109 return btrfs_ioctl_defrag(file);
1110 case BTRFS_IOC_RESIZE:
1111 return btrfs_ioctl_resize(root, argp);
1112 case BTRFS_IOC_ADD_DEV:
1113 return btrfs_ioctl_add_dev(root, argp);
1114 case BTRFS_IOC_RM_DEV:
1115 return btrfs_ioctl_rm_dev(root, argp);
1116 case BTRFS_IOC_BALANCE:
1117 return btrfs_balance(root->fs_info->dev_root);
1118 case BTRFS_IOC_CLONE:
1119 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1120 case BTRFS_IOC_CLONE_RANGE:
1121 return btrfs_ioctl_clone_range(file, argp);
1122 case BTRFS_IOC_TRANS_START:
1123 return btrfs_ioctl_trans_start(file);
1124 case BTRFS_IOC_TRANS_END:
1125 return btrfs_ioctl_trans_end(file);
1126 case BTRFS_IOC_SYNC:
1127 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1128 return 0;
1129 }
1130
1131 return -ENOTTY;
1132}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..78049ea208db
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 3072
26
27struct btrfs_ioctl_vol_args {
28 __s64 fd;
29 char name[BTRFS_PATH_NAME_MAX + 1];
30};
31
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
35 struct btrfs_ioctl_vol_args)
36#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
37 struct btrfs_ioctl_vol_args)
38#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
39 struct btrfs_ioctl_vol_args)
40/* trans start and trans end are dangerous, and only for
41 * use by applications that know how to avoid the
42 * resulting deadlocks
43 */
44#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
45#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
46#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
47
48#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
49#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
50 struct btrfs_ioctl_vol_args)
51#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
52 struct btrfs_ioctl_vol_args)
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args)
55struct btrfs_ioctl_clone_range_args {
56 __s64 src_fd;
57 __u64 src_offset, src_length;
58 __u64 dest_offset;
59};
60
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args)
63
64#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
65 struct btrfs_ioctl_vol_args)
66
67#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37
38int btrfs_tree_lock(struct extent_buffer *eb)
39{
40 int i;
41
42 if (mutex_trylock(&eb->mutex))
43 return 0;
44 for (i = 0; i < 512; i++) {
45 cpu_relax();
46 if (mutex_trylock(&eb->mutex))
47 return 0;
48 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0;
52}
53
54int btrfs_try_tree_lock(struct extent_buffer *eb)
55{
56 return mutex_trylock(&eb->mutex);
57}
58
59int btrfs_tree_unlock(struct extent_buffer *eb)
60{
61 mutex_unlock(&eb->mutex);
62 return 0;
63}
64
65int btrfs_tree_locked(struct extent_buffer *eb)
66{
67 return mutex_is_locked(&eb->mutex);
68}
69
70/*
71 * btrfs_search_slot uses this to decide if it should drop its locks
72 * before doing something expensive like allocating free blocks for cow.
73 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{
76 int i;
77 struct extent_buffer *eb;
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i];
80 if (!eb)
81 break;
82 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list))
84 return 1;
85 }
86 return 0;
87}
88
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or it returns the node it did find
37 * in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node **p = &root->rb_node;
43 struct rb_node *parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node *n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 BUG_ON(node);
197
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents);
204 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
205
206 mutex_unlock(&tree->mutex);
207 BUG_ON(node);
208 return 0;
209}
210
211/*
212 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
213 * when an ordered extent is finished. If the list covers more than one
214 * ordered extent, it is split across multiples.
215 */
216int btrfs_add_ordered_sum(struct inode *inode,
217 struct btrfs_ordered_extent *entry,
218 struct btrfs_ordered_sum *sum)
219{
220 struct btrfs_ordered_inode_tree *tree;
221
222 tree = &BTRFS_I(inode)->ordered_tree;
223 mutex_lock(&tree->mutex);
224 list_add_tail(&sum->list, &entry->list);
225 mutex_unlock(&tree->mutex);
226 return 0;
227}
228
229/*
230 * this is used to account for finished IO across a given range
231 * of the file. The IO should not span ordered extents. If
232 * a given ordered_extent is completely done, 1 is returned, otherwise
233 * 0.
234 *
235 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
236 * to make sure this function only returns 1 once for a given ordered extent.
237 */
238int btrfs_dec_test_ordered_pending(struct inode *inode,
239 u64 file_offset, u64 io_size)
240{
241 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node;
243 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret;
246
247 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset);
252 if (!node) {
253 ret = 1;
254 goto out;
255 }
256
257 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
258 if (!offset_in_entry(entry, file_offset)) {
259 ret = 1;
260 goto out;
261 }
262
263 ret = test_range_bit(io_tree, entry->file_offset,
264 entry->file_offset + entry->len - 1,
265 EXTENT_ORDERED, 0);
266 if (ret == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
268out:
269 mutex_unlock(&tree->mutex);
270 return ret == 0;
271}
272
273/*
274 * used to drop a reference on an ordered extent. This will free
275 * the extent if the last reference is dropped
276 */
277int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
278{
279 struct list_head *cur;
280 struct btrfs_ordered_sum *sum;
281
282 if (atomic_dec_and_test(&entry->refs)) {
283 while (!list_empty(&entry->list)) {
284 cur = entry->list.next;
285 sum = list_entry(cur, struct btrfs_ordered_sum, list);
286 list_del(&sum->list);
287 kfree(sum);
288 }
289 kfree(entry);
290 }
291 return 0;
292}
293
294/*
295 * remove an ordered extent from the tree. No references are dropped
296 * but, anyone waiting on this extent is woken up.
297 */
298int btrfs_remove_ordered_extent(struct inode *inode,
299 struct btrfs_ordered_extent *entry)
300{
301 struct btrfs_ordered_inode_tree *tree;
302 struct rb_node *node;
303
304 tree = &BTRFS_I(inode)->ordered_tree;
305 mutex_lock(&tree->mutex);
306 node = &entry->rb_node;
307 rb_erase(node, &tree->tree);
308 tree->last = NULL;
309 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list);
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314
315 mutex_unlock(&tree->mutex);
316 wake_up(&entry->wait);
317 return 0;
318}
319
320/*
321 * wait for all the ordered extents in a root. This is done when balancing
322 * space between drives.
323 */
324int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
325{
326 struct list_head splice;
327 struct list_head *cur;
328 struct btrfs_ordered_extent *ordered;
329 struct inode *inode;
330
331 INIT_LIST_HEAD(&splice);
332
333 spin_lock(&root->fs_info->ordered_extent_lock);
334 list_splice_init(&root->fs_info->ordered_extents, &splice);
335 while (!list_empty(&splice)) {
336 cur = splice.next;
337 ordered = list_entry(cur, struct btrfs_ordered_extent,
338 root_extent_list);
339 if (nocow_only &&
340 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
341 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
392 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395 }
396}
397
398/*
399 * Used to wait on ordered extents across a large range of bytes.
400 */
401int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
402{
403 u64 end;
404 u64 orig_end;
405 u64 wait_end;
406 struct btrfs_ordered_extent *ordered;
407
408 if (start + len < start) {
409 orig_end = INT_LIMIT(loff_t);
410 } else {
411 orig_end = start + len - 1;
412 if (orig_end > INT_LIMIT(loff_t))
413 orig_end = INT_LIMIT(loff_t);
414 }
415 wait_end = orig_end;
416again:
417 /* start IO across the range first to instantiate any delalloc
418 * extents
419 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
421
422 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again
424 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
425 */
426 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
427
428 btrfs_wait_on_page_writeback_range(inode->i_mapping,
429 start >> PAGE_CACHE_SHIFT,
430 orig_end >> PAGE_CACHE_SHIFT);
431
432 end = orig_end;
433 while (1) {
434 ordered = btrfs_lookup_first_ordered_extent(inode, end);
435 if (!ordered)
436 break;
437 if (ordered->file_offset > orig_end) {
438 btrfs_put_ordered_extent(ordered);
439 break;
440 }
441 if (ordered->file_offset + ordered->len < start) {
442 btrfs_put_ordered_extent(ordered);
443 break;
444 }
445 btrfs_start_ordered_extent(inode, ordered, 1);
446 end = ordered->file_offset;
447 btrfs_put_ordered_extent(ordered);
448 if (end == 0 || end == start)
449 break;
450 end--;
451 }
452 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
453 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
454 schedule_timeout(1);
455 goto again;
456 }
457 return 0;
458}
459
460/*
461 * find an ordered extent corresponding to file_offset. return NULL if
462 * nothing is found, otherwise take a reference on the extent and return it
463 */
464struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
465 u64 file_offset)
466{
467 struct btrfs_ordered_inode_tree *tree;
468 struct rb_node *node;
469 struct btrfs_ordered_extent *entry = NULL;
470
471 tree = &BTRFS_I(inode)->ordered_tree;
472 mutex_lock(&tree->mutex);
473 node = tree_search(tree, file_offset);
474 if (!node)
475 goto out;
476
477 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
478 if (!offset_in_entry(entry, file_offset))
479 entry = NULL;
480 if (entry)
481 atomic_inc(&entry->refs);
482out:
483 mutex_unlock(&tree->mutex);
484 return entry;
485}
486
487/*
488 * lookup and return any extent before 'file_offset'. NULL is returned
489 * if none is found
490 */
491struct btrfs_ordered_extent *
492btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
493{
494 struct btrfs_ordered_inode_tree *tree;
495 struct rb_node *node;
496 struct btrfs_ordered_extent *entry = NULL;
497
498 tree = &BTRFS_I(inode)->ordered_tree;
499 mutex_lock(&tree->mutex);
500 node = tree_search(tree, file_offset);
501 if (!node)
502 goto out;
503
504 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
505 atomic_inc(&entry->refs);
506out:
507 mutex_unlock(&tree->mutex);
508 return entry;
509}
510
511/*
512 * After an extent is done, call this to conditionally update the on disk
513 * i_size. i_size is updated to cover any fully written part of the file.
514 */
515int btrfs_ordered_update_i_size(struct inode *inode,
516 struct btrfs_ordered_extent *ordered)
517{
518 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
519 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
520 u64 disk_i_size;
521 u64 new_i_size;
522 u64 i_size_test;
523 struct rb_node *node;
524 struct btrfs_ordered_extent *test;
525
526 mutex_lock(&tree->mutex);
527 disk_i_size = BTRFS_I(inode)->disk_i_size;
528
529 /*
530 * if the disk i_size is already at the inode->i_size, or
531 * this ordered extent is inside the disk i_size, we're done
532 */
533 if (disk_i_size >= inode->i_size ||
534 ordered->file_offset + ordered->len <= disk_i_size) {
535 goto out;
536 }
537
538 /*
539 * we can't update the disk_isize if there are delalloc bytes
540 * between disk_i_size and this ordered extent
541 */
542 if (test_range_bit(io_tree, disk_i_size,
543 ordered->file_offset + ordered->len - 1,
544 EXTENT_DELALLOC, 0)) {
545 goto out;
546 }
547 /*
548 * walk backward from this ordered extent to disk_i_size.
549 * if we find an ordered extent then we can't update disk i_size
550 * yet
551 */
552 node = &ordered->rb_node;
553 while (1) {
554 node = rb_prev(node);
555 if (!node)
556 break;
557 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
558 if (test->file_offset + test->len <= disk_i_size)
559 break;
560 if (test->file_offset >= inode->i_size)
561 break;
562 if (test->file_offset >= disk_i_size)
563 goto out;
564 }
565 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
566
567 /*
568 * at this point, we know we can safely update i_size to at least
569 * the offset from this ordered extent. But, we need to
570 * walk forward and see if ios from higher up in the file have
571 * finished.
572 */
573 node = rb_next(&ordered->rb_node);
574 i_size_test = 0;
575 if (node) {
576 /*
577 * do we have an area where IO might have finished
578 * between our ordered extent and the next one.
579 */
580 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
581 if (test->file_offset > entry_end(ordered))
582 i_size_test = test->file_offset;
583 } else {
584 i_size_test = i_size_read(inode);
585 }
586
587 /*
588 * i_size_test is the end of a region after this ordered
589 * extent where there are no ordered extents. As long as there
590 * are no delalloc bytes in this area, it is safe to update
591 * disk_i_size to the end of the region.
592 */
593 if (i_size_test > entry_end(ordered) &&
594 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
595 EXTENT_DELALLOC, 0)) {
596 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
597 }
598 BTRFS_I(inode)->disk_i_size = new_i_size;
599out:
600 mutex_unlock(&tree->mutex);
601 return 0;
602}
603
604/*
605 * search the ordered extents for one corresponding to 'offset' and
606 * try to find a checksum. This is used because we allow pages to
607 * be reclaimed before their checksum is actually put into the btree
608 */
609int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
610 u32 *sum)
611{
612 struct btrfs_ordered_sum *ordered_sum;
613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors;
618 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
620 int ret = 1;
621
622 ordered = btrfs_lookup_ordered_extent(inode, offset);
623 if (!ordered)
624 return 1;
625
626 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums;
632 for (i = 0; i < num_sectors; i++) {
633 if (sector_sums[i].bytenr == disk_bytenr) {
634 *sum = sector_sums[i].sum;
635 ret = 0;
636 goto out;
637 }
638 }
639 }
640 }
641out:
642 mutex_unlock(&tree->mutex);
643 btrfs_put_ordered_extent(ordered);
644 return ret;
645}
646
647
648/**
649 * taken from mm/filemap.c because it isn't exported
650 *
651 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
652 * @mapping: address space structure to write
653 * @start: offset in bytes where the range starts
654 * @end: offset in bytes where the range ends (inclusive)
655 * @sync_mode: enable synchronous operation
656 *
657 * Start writeback against all of a mapping's dirty pages that lie
658 * within the byte offsets <start, end> inclusive.
659 *
660 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
661 * opposed to a regular memory cleansing writeback. The difference between
662 * these two operations is that if a dirty page/buffer is encountered, it must
663 * be waited upon, and not just skipped over.
664 */
665int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
666 loff_t end, int sync_mode)
667{
668 struct writeback_control wbc = {
669 .sync_mode = sync_mode,
670 .nr_to_write = mapping->nrpages * 2,
671 .range_start = start,
672 .range_end = end,
673 .for_writepages = 1,
674 };
675 return btrfs_writepages(mapping, &wbc);
676}
677
678/**
679 * taken from mm/filemap.c because it isn't exported
680 *
681 * wait_on_page_writeback_range - wait for writeback to complete
682 * @mapping: target address_space
683 * @start: beginning page index
684 * @end: ending page index
685 *
686 * Wait for writeback to complete against pages indexed by start->end
687 * inclusive
688 */
689int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
690 pgoff_t start, pgoff_t end)
691{
692 struct pagevec pvec;
693 int nr_pages;
694 int ret = 0;
695 pgoff_t index;
696
697 if (end < start)
698 return 0;
699
700 pagevec_init(&pvec, 0);
701 index = start;
702 while ((index <= end) &&
703 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
704 PAGECACHE_TAG_WRITEBACK,
705 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
706 unsigned i;
707
708 for (i = 0; i < nr_pages; i++) {
709 struct page *page = pvec.pages[i];
710
711 /* until radix tree lookup accepts end_index */
712 if (page->index > end)
713 continue;
714
715 wait_on_page_writeback(page);
716 if (PageError(page))
717 ret = -EIO;
718 }
719 pagevec_release(&pvec);
720 cond_resched();
721 }
722
723 /* Check for outstanding write errors */
724 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
725 ret = -ENOSPC;
726 if (test_and_clear_bit(AS_EIO, &mapping->flags))
727 ret = -EIO;
728
729 return ret;
730}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */
43 u64 bytenr;
44
45 /*
46 * this is the length in bytes covered by the sums array below.
47 */
48 unsigned long len;
49 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */
51 struct btrfs_sector_sum sums[];
52};
53
54/*
55 * bits for the flags field:
56 *
57 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
58 * It is used to make sure metadata is inserted into the tree only once
59 * per extent.
60 *
61 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
62 * rbtree, just before waking any waiters. It is used to indicate the
63 * IO is done and any metadata is inserted into the tree.
64 */
65#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
66
67#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74
75struct btrfs_ordered_extent {
76 /* logical offset in the file */
77 u64 file_offset;
78
79 /* disk byte number */
80 u64 start;
81
82 /* ram length of the extent in bytes */
83 u64 len;
84
85 /* extent length on disk */
86 u64 disk_len;
87
88 /* flags (described above) */
89 unsigned long flags;
90
91 /* reference count */
92 atomic_t refs;
93
94 /* the inode we belong to */
95 struct inode *inode;
96
97 /* list of checksums for insertion when the extent io is done */
98 struct list_head list;
99
100 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
101 wait_queue_head_t wait;
102
103 /* our friendly rbtree entry */
104 struct rb_node rb_node;
105
106 /* a per root list of all the pending ordered extents */
107 struct list_head root_extent_list;
108};
109
110
111/*
112 * calculates the total size you need to allocate for an ordered sum
113 * structure spanning 'bytes' in the file
114 */
115static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
116 unsigned long bytes)
117{
118 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
119 root->sectorsize;
120 num_sectors++;
121 return sizeof(struct btrfs_ordered_sum) +
122 num_sectors * sizeof(struct btrfs_sector_sum);
123}
124
125static inline void
126btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
127{
128 mutex_init(&t->mutex);
129 t->tree.rb_node = NULL;
130 t->last = NULL;
131}
132
133int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
134int btrfs_remove_ordered_extent(struct inode *inode,
135 struct btrfs_ordered_extent *entry);
136int btrfs_dec_test_ordered_pending(struct inode *inode,
137 u64 file_offset, u64 io_size);
138int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
139 u64 start, u64 len, u64 disk_len, int tyep);
140int btrfs_add_ordered_sum(struct inode *inode,
141 struct btrfs_ordered_extent *entry,
142 struct btrfs_ordered_sum *sum);
143struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
144 u64 file_offset);
145void btrfs_start_ordered_extent(struct inode *inode,
146 struct btrfs_ordered_extent *entry, int wait);
147int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
148struct btrfs_ordered_extent *
149btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
28 "num_stripes %d\n",
29 (unsigned long long)btrfs_chunk_length(eb, chunk),
30 (unsigned long long)btrfs_chunk_owner(eb, chunk),
31 (unsigned long long)btrfs_chunk_type(eb, chunk),
32 num_stripes);
33 for (i = 0 ; i < num_stripes ; i++) {
34 printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
35 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
36 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
37 }
38}
39static void print_dev_item(struct extent_buffer *eb,
40 struct btrfs_dev_item *dev_item)
41{
42 printk(KERN_INFO "\t\tdev item devid %llu "
43 "total_bytes %llu bytes used %llu\n",
44 (unsigned long long)btrfs_device_id(eb, dev_item),
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47}
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{
50 int i;
51 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi;
59 struct btrfs_key key;
60 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr,
67 btrfs_leaf_free_space(root, l));
68 for (i = 0 ; i < nr ; i++) {
69 item = btrfs_item_nr(l, i);
70 btrfs_item_key_to_cpu(l, &key, i);
71 type = btrfs_key_type(&key);
72 printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
73 "itemsize %d\n",
74 i,
75 (unsigned long long)key.objectid, type,
76 (unsigned long long)key.offset,
77 btrfs_item_offset(l, item), btrfs_item_size(l, item));
78 switch (type) {
79 case BTRFS_INODE_ITEM_KEY:
80 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
81 printk(KERN_INFO "\t\tinode generation %llu size %llu "
82 "mode %o\n",
83 (unsigned long long)
84 btrfs_inode_generation(l, ii),
85 (unsigned long long)btrfs_inode_size(l, ii),
86 btrfs_inode_mode(l, ii));
87 break;
88 case BTRFS_DIR_ITEM_KEY:
89 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
90 btrfs_dir_item_key_to_cpu(l, di, &found_key);
91 printk(KERN_INFO "\t\tdir oid %llu type %u\n",
92 (unsigned long long)found_key.objectid,
93 btrfs_dir_type(l, di));
94 break;
95 case BTRFS_ROOT_ITEM_KEY:
96 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
97 printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
98 (unsigned long long)
99 btrfs_disk_root_bytenr(l, ri),
100 btrfs_disk_root_refs(l, ri));
101 break;
102 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
104 printk(KERN_INFO "\t\textent data refs %u\n",
105 btrfs_extent_refs(l, ei));
106 break;
107 case BTRFS_EXTENT_REF_KEY:
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
109 printk(KERN_INFO "\t\textent back ref root %llu "
110 "gen %llu owner %llu num_refs %lu\n",
111 (unsigned long long)btrfs_ref_root(l, ref),
112 (unsigned long long)btrfs_ref_generation(l, ref),
113 (unsigned long long)btrfs_ref_objectid(l, ref),
114 (unsigned long)btrfs_ref_num_refs(l, ref));
115 break;
116
117 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item);
120 if (btrfs_file_extent_type(l, fi) ==
121 BTRFS_FILE_EXTENT_INLINE) {
122 printk(KERN_INFO "\t\tinline extent data "
123 "size %u\n",
124 btrfs_file_extent_inline_len(l, fi));
125 break;
126 }
127 printk(KERN_INFO "\t\textent data disk bytenr %llu "
128 "nr %llu\n",
129 (unsigned long long)
130 btrfs_file_extent_disk_bytenr(l, fi),
131 (unsigned long long)
132 btrfs_file_extent_disk_num_bytes(l, fi));
133 printk(KERN_INFO "\t\textent data offset %llu "
134 "nr %llu ram %llu\n",
135 (unsigned long long)
136 btrfs_file_extent_offset(l, fi),
137 (unsigned long long)
138 btrfs_file_extent_num_bytes(l, fi),
139 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi));
141 break;
142 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item);
145 printk(KERN_INFO "\t\tblock group used %llu\n",
146 (unsigned long long)
147 btrfs_disk_block_group_used(l, bi));
148 break;
149 case BTRFS_CHUNK_ITEM_KEY:
150 print_chunk(l, btrfs_item_ptr(l, i,
151 struct btrfs_chunk));
152 break;
153 case BTRFS_DEV_ITEM_KEY:
154 print_dev_item(l, btrfs_item_ptr(l, i,
155 struct btrfs_dev_item));
156 break;
157 case BTRFS_DEV_EXTENT_KEY:
158 dev_extent = btrfs_item_ptr(l, i,
159 struct btrfs_dev_extent);
160 printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
161 "\t\tchunk objectid %llu chunk offset %llu "
162 "length %llu\n",
163 (unsigned long long)
164 btrfs_dev_extent_chunk_tree(l, dev_extent),
165 (unsigned long long)
166 btrfs_dev_extent_chunk_objectid(l, dev_extent),
167 (unsigned long long)
168 btrfs_dev_extent_chunk_offset(l, dev_extent),
169 (unsigned long long)
170 btrfs_dev_extent_length(l, dev_extent));
171 };
172 }
173}
174
175void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
176{
177 int i; u32 nr;
178 struct btrfs_key key;
179 int level;
180
181 if (!c)
182 return;
183 nr = btrfs_header_nritems(c);
184 level = btrfs_header_level(c);
185 if (level == 0) {
186 btrfs_print_leaf(root, c);
187 return;
188 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i);
195 printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
196 i,
197 (unsigned long long)key.objectid,
198 key.type,
199 (unsigned long long)key.offset,
200 (unsigned long long)btrfs_node_blockptr(c, i));
201 }
202 for (i = 0; i < nr; i++) {
203 struct extent_buffer *next = read_tree_block(root,
204 btrfs_node_blockptr(c, i),
205 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1)
209 BUG();
210 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1)
212 BUG();
213 btrfs_print_tree(root, next);
214 free_extent_buffer(next);
215 }
216}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node **p = &root->rb_node;
78 struct rb_node *parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node *n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
65void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
136 (unsigned long long)key->objectid, key->type,
137 (unsigned long long)key->offset);
138 BUG_ON(1);
139 }
140
141 l = path->nodes[0];
142 slot = path->slots[0];
143 ptr = btrfs_item_ptr_offset(l, slot);
144 write_extent_buffer(l, item, ptr, sizeof(*item));
145 btrfs_mark_buffer_dirty(path->nodes[0]);
146out:
147 btrfs_release_path(root, path);
148 btrfs_free_path(path);
149 return ret;
150}
151
152int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
153 *root, struct btrfs_key *key, struct btrfs_root_item
154 *item)
155{
156 int ret;
157 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
158 return ret;
159}
160
161/*
162 * at mount time we want to find all the old transaction snapshots that were in
163 * the process of being deleted if we crashed. This is any root item with an
164 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed.
166 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
168 struct btrfs_root *latest)
169{
170 struct btrfs_root *dead_root;
171 struct btrfs_item *item;
172 struct btrfs_root_item *ri;
173 struct btrfs_key key;
174 struct btrfs_key found_key;
175 struct btrfs_path *path;
176 int ret;
177 u32 nritems;
178 struct extent_buffer *leaf;
179 int slot;
180
181 key.objectid = objectid;
182 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
183 key.offset = 0;
184 path = btrfs_alloc_path();
185 if (!path)
186 return -ENOMEM;
187
188again:
189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
190 if (ret < 0)
191 goto err;
192 while (1) {
193 leaf = path->nodes[0];
194 nritems = btrfs_header_nritems(leaf);
195 slot = path->slots[0];
196 if (slot >= nritems) {
197 ret = btrfs_next_leaf(root, path);
198 if (ret)
199 break;
200 leaf = path->nodes[0];
201 nritems = btrfs_header_nritems(leaf);
202 slot = path->slots[0];
203 }
204 item = btrfs_item_nr(leaf, slot);
205 btrfs_item_key_to_cpu(leaf, &key, slot);
206 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
207 goto next;
208
209 if (key.objectid < objectid)
210 goto next;
211
212 if (key.objectid > objectid)
213 break;
214
215 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
216 if (btrfs_disk_root_refs(leaf, ri) != 0)
217 goto next;
218
219 memcpy(&found_key, &key, sizeof(key));
220 key.offset++;
221 btrfs_release_path(root, path);
222 dead_root =
223 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
224 &found_key);
225 if (IS_ERR(dead_root)) {
226 ret = PTR_ERR(dead_root);
227 goto err;
228 }
229
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret)
235 goto err;
236 goto again;
237next:
238 slot++;
239 path->slots[0]++;
240 }
241 ret = 0;
242err:
243 btrfs_free_path(path);
244 return ret;
245}
246
247/* drop the root item for 'key' from 'root' */
248int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
249 struct btrfs_key *key)
250{
251 struct btrfs_path *path;
252 int ret;
253 u32 refs;
254 struct btrfs_root_item *ri;
255 struct extent_buffer *leaf;
256
257 path = btrfs_alloc_path();
258 BUG_ON(!path);
259 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
260 if (ret < 0)
261 goto out;
262
263 BUG_ON(ret != 0);
264 leaf = path->nodes[0];
265 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
266
267 refs = btrfs_disk_root_refs(leaf, ri);
268 BUG_ON(refs != 0);
269 ret = btrfs_del_item(trans, root, path);
270out:
271 btrfs_release_path(root, path);
272 btrfs_free_path(path);
273 return ret;
274}
275
276#if 0 /* this will get used when snapshot deletion is implemented */
277int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
278 struct btrfs_root *tree_root,
279 u64 root_id, u8 type, u64 ref_id)
280{
281 struct btrfs_key key;
282 int ret;
283 struct btrfs_path *path;
284
285 path = btrfs_alloc_path();
286
287 key.objectid = root_id;
288 key.type = type;
289 key.offset = ref_id;
290
291 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
292 BUG_ON(ret);
293
294 ret = btrfs_del_item(trans, tree_root, path);
295 BUG_ON(ret);
296
297 btrfs_free_path(path);
298 return ret;
299}
300#endif
301
302int btrfs_find_root_ref(struct btrfs_root *tree_root,
303 struct btrfs_path *path,
304 u64 root_id, u64 ref_id)
305{
306 struct btrfs_key key;
307 int ret;
308
309 key.objectid = root_id;
310 key.type = BTRFS_ROOT_REF_KEY;
311 key.offset = ref_id;
312
313 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
314 return ret;
315}
316
317
318/*
319 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
320 * or BTRFS_ROOT_BACKREF_KEY.
321 *
322 * The dirid, sequence, name and name_len refer to the directory entry
323 * that is referencing the root.
324 *
325 * For a forward ref, the root_id is the id of the tree referencing
326 * the root and ref_id is the id of the subvol or snapshot.
327 *
328 * For a back ref the root_id is the id of the subvol or snapshot and
329 * ref_id is the id of the tree referencing it.
330 */
331int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
332 struct btrfs_root *tree_root,
333 u64 root_id, u8 type, u64 ref_id,
334 u64 dirid, u64 sequence,
335 const char *name, int name_len)
336{
337 struct btrfs_key key;
338 int ret;
339 struct btrfs_path *path;
340 struct btrfs_root_ref *ref;
341 struct extent_buffer *leaf;
342 unsigned long ptr;
343
344
345 path = btrfs_alloc_path();
346
347 key.objectid = root_id;
348 key.type = type;
349 key.offset = ref_id;
350
351 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
352 sizeof(*ref) + name_len);
353 BUG_ON(ret);
354
355 leaf = path->nodes[0];
356 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
357 btrfs_set_root_ref_dirid(leaf, ref, dirid);
358 btrfs_set_root_ref_sequence(leaf, ref, sequence);
359 btrfs_set_root_ref_name_len(leaf, ref, name_len);
360 ptr = (unsigned long)(ref + 1);
361 write_extent_buffer(leaf, name, ptr, name_len);
362 btrfs_mark_buffer_dirty(leaf);
363
364 btrfs_free_path(path);
365 return ret;
366}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */
43
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
47u##bits btrfs_##name(struct extent_buffer *eb, \
48 type *s) \
49{ \
50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \
53 /* ugly, but we want the fast path here */ \
54 if (eb->map_token && offset >= eb->map_start && \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \
56 eb->map_len) { \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
58 return le##bits##_to_cpu(p->member); \
59 } \
60 { \
61 int err; \
62 char *map_token; \
63 char *kaddr; \
64 int unmap_on_exit = (eb->map_token == NULL); \
65 unsigned long map_start; \
66 unsigned long map_len; \
67 u##bits res; \
68 err = map_extent_buffer(eb, offset, \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \
84void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \
86{ \
87 unsigned long part_offset = (unsigned long)s; \
88 unsigned long offset = part_offset + offsetof(type, member); \
89 type *p; \
90 /* ugly, but we want the fast path here */ \
91 if (eb->map_token && offset >= eb->map_start && \
92 offset + sizeof(((type *)0)->member) <= eb->map_start + \
93 eb->map_len) { \
94 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
95 p->member = cpu_to_le##bits(val); \
96 return; \
97 } \
98 { \
99 int err; \
100 char *map_token; \
101 char *kaddr; \
102 int unmap_on_exit = (eb->map_token == NULL); \
103 unsigned long map_start; \
104 unsigned long map_len; \
105 err = map_extent_buffer(eb, offset, \
106 sizeof(((type *)0)->member), \
107 &map_token, &kaddr, \
108 &map_start, &map_len, KM_USER1); \
109 if (err) { \
110 __le##bits val2; \
111 val2 = cpu_to_le##bits(val); \
112 write_eb_member(eb, s, type, member, &val2); \
113 return; \
114 } \
115 p = (type *)(kaddr + part_offset - map_start); \
116 p->member = cpu_to_le##bits(val); \
117 if (unmap_on_exit) \
118 unmap_extent_buffer(eb, map_token, KM_USER1); \
119 } \
120}
121
122#include "ctree.h"
123
124void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr)
126{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key);
139}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..0a14b495532f
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,722 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "xattr.h"
49#include "volumes.h"
50#include "version.h"
51#include "export.h"
52#include "compression.h"
53
54#define BTRFS_SUPER_MAGIC 0x9123683E
55
56static struct super_operations btrfs_super_ops;
57
58static void btrfs_put_super(struct super_block *sb)
59{
60 struct btrfs_root *root = btrfs_sb(sb);
61 int ret;
62
63 ret = close_ctree(root);
64 sb->s_fs_info = NULL;
65}
66
67enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
71};
72
73static match_table_t tokens = {
74 {Opt_degraded, "degraded"},
75 {Opt_subvol, "subvol=%s"},
76 {Opt_device, "device=%s"},
77 {Opt_nodatasum, "nodatasum"},
78 {Opt_nodatacow, "nodatacow"},
79 {Opt_nobarrier, "nobarrier"},
80 {Opt_max_extent, "max_extent=%s"},
81 {Opt_max_inline, "max_inline=%s"},
82 {Opt_alloc_start, "alloc_start=%s"},
83 {Opt_thread_pool, "thread_pool=%d"},
84 {Opt_compress, "compress"},
85 {Opt_ssd, "ssd"},
86 {Opt_noacl, "noacl"},
87 {Opt_err, NULL},
88};
89
90u64 btrfs_parse_size(char *str)
91{
92 u64 res;
93 int mult = 1;
94 char *end;
95 char last;
96
97 res = simple_strtoul(str, &end, 10);
98
99 last = end[0];
100 if (isalpha(last)) {
101 last = tolower(last);
102 switch (last) {
103 case 'g':
104 mult *= 1024;
105 case 'm':
106 mult *= 1024;
107 case 'k':
108 mult *= 1024;
109 }
110 res = res * mult;
111 }
112 return res;
113}
114
115/*
116 * Regular mount options parser. Everything that is needed only when
117 * reading in a new superblock is parsed here.
118 */
119int btrfs_parse_options(struct btrfs_root *root, char *options)
120{
121 struct btrfs_fs_info *info = root->fs_info;
122 substring_t args[MAX_OPT_ARGS];
123 char *p, *num;
124 int intarg;
125
126 if (!options)
127 return 0;
128
129 /*
130 * strsep changes the string, duplicate it because parse_options
131 * gets called twice
132 */
133 options = kstrdup(options, GFP_NOFS);
134 if (!options)
135 return -ENOMEM;
136
137
138 while ((p = strsep(&options, ",")) != NULL) {
139 int token;
140 if (!*p)
141 continue;
142
143 token = match_token(p, tokens, args);
144 switch (token) {
145 case Opt_degraded:
146 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
147 btrfs_set_opt(info->mount_opt, DEGRADED);
148 break;
149 case Opt_subvol:
150 case Opt_device:
151 /*
152 * These are parsed by btrfs_parse_early_options
153 * and can be happily ignored here.
154 */
155 break;
156 case Opt_nodatasum:
157 printk(KERN_INFO "btrfs: setting nodatacsum\n");
158 btrfs_set_opt(info->mount_opt, NODATASUM);
159 break;
160 case Opt_nodatacow:
161 printk(KERN_INFO "btrfs: setting nodatacow\n");
162 btrfs_set_opt(info->mount_opt, NODATACOW);
163 btrfs_set_opt(info->mount_opt, NODATASUM);
164 break;
165 case Opt_compress:
166 printk(KERN_INFO "btrfs: use compression\n");
167 btrfs_set_opt(info->mount_opt, COMPRESS);
168 break;
169 case Opt_ssd:
170 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
171 btrfs_set_opt(info->mount_opt, SSD);
172 break;
173 case Opt_nobarrier:
174 printk(KERN_INFO "btrfs: turning off barriers\n");
175 btrfs_set_opt(info->mount_opt, NOBARRIER);
176 break;
177 case Opt_thread_pool:
178 intarg = 0;
179 match_int(&args[0], &intarg);
180 if (intarg) {
181 info->thread_pool_size = intarg;
182 printk(KERN_INFO "btrfs: thread pool %d\n",
183 info->thread_pool_size);
184 }
185 break;
186 case Opt_max_extent:
187 num = match_strdup(&args[0]);
188 if (num) {
189 info->max_extent = btrfs_parse_size(num);
190 kfree(num);
191
192 info->max_extent = max_t(u64,
193 info->max_extent, root->sectorsize);
194 printk(KERN_INFO "btrfs: max_extent at %llu\n",
195 info->max_extent);
196 }
197 break;
198 case Opt_max_inline:
199 num = match_strdup(&args[0]);
200 if (num) {
201 info->max_inline = btrfs_parse_size(num);
202 kfree(num);
203
204 if (info->max_inline) {
205 info->max_inline = max_t(u64,
206 info->max_inline,
207 root->sectorsize);
208 }
209 printk(KERN_INFO "btrfs: max_inline at %llu\n",
210 info->max_inline);
211 }
212 break;
213 case Opt_alloc_start:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->alloc_start = btrfs_parse_size(num);
217 kfree(num);
218 printk(KERN_INFO
219 "btrfs: allocations start at %llu\n",
220 info->alloc_start);
221 }
222 break;
223 case Opt_noacl:
224 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
225 break;
226 default:
227 break;
228 }
229 }
230 kfree(options);
231 return 0;
232}
233
234/*
235 * Parse mount options that are required early in the mount process.
236 *
237 * All other options will be parsed on much later in the mount process and
238 * only when we need to allocate a new super block.
239 */
240static int btrfs_parse_early_options(const char *options, fmode_t flags,
241 void *holder, char **subvol_name,
242 struct btrfs_fs_devices **fs_devices)
243{
244 substring_t args[MAX_OPT_ARGS];
245 char *opts, *p;
246 int error = 0;
247
248 if (!options)
249 goto out;
250
251 /*
252 * strsep changes the string, duplicate it because parse_options
253 * gets called twice
254 */
255 opts = kstrdup(options, GFP_KERNEL);
256 if (!opts)
257 return -ENOMEM;
258
259 while ((p = strsep(&opts, ",")) != NULL) {
260 int token;
261 if (!*p)
262 continue;
263
264 token = match_token(p, tokens, args);
265 switch (token) {
266 case Opt_subvol:
267 *subvol_name = match_strdup(&args[0]);
268 break;
269 case Opt_device:
270 error = btrfs_scan_one_device(match_strdup(&args[0]),
271 flags, holder, fs_devices);
272 if (error)
273 goto out_free_opts;
274 break;
275 default:
276 break;
277 }
278 }
279
280 out_free_opts:
281 kfree(opts);
282 out:
283 /*
284 * If no subvolume name is specified we use the default one. Allocate
285 * a copy of the string "." here so that code later in the
286 * mount path doesn't care if it's the default volume or another one.
287 */
288 if (!*subvol_name) {
289 *subvol_name = kstrdup(".", GFP_KERNEL);
290 if (!*subvol_name)
291 return -ENOMEM;
292 }
293 return error;
294}
295
296static int btrfs_fill_super(struct super_block *sb,
297 struct btrfs_fs_devices *fs_devices,
298 void *data, int silent)
299{
300 struct inode *inode;
301 struct dentry *root_dentry;
302 struct btrfs_super_block *disk_super;
303 struct btrfs_root *tree_root;
304 struct btrfs_inode *bi;
305 int err;
306
307 sb->s_maxbytes = MAX_LFS_FILESIZE;
308 sb->s_magic = BTRFS_SUPER_MAGIC;
309 sb->s_op = &btrfs_super_ops;
310 sb->s_export_op = &btrfs_export_ops;
311 sb->s_xattr = btrfs_xattr_handlers;
312 sb->s_time_gran = 1;
313 sb->s_flags |= MS_POSIXACL;
314
315 tree_root = open_ctree(sb, fs_devices, (char *)data);
316
317 if (IS_ERR(tree_root)) {
318 printk("btrfs: open_ctree failed\n");
319 return PTR_ERR(tree_root);
320 }
321 sb->s_fs_info = tree_root;
322 disk_super = &tree_root->fs_info->super_copy;
323 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
324 tree_root->fs_info->fs_root);
325 bi = BTRFS_I(inode);
326 bi->location.objectid = inode->i_ino;
327 bi->location.offset = 0;
328 bi->root = tree_root->fs_info->fs_root;
329
330 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
331
332 if (!inode) {
333 err = -ENOMEM;
334 goto fail_close;
335 }
336 if (inode->i_state & I_NEW) {
337 btrfs_read_locked_inode(inode);
338 unlock_new_inode(inode);
339 }
340
341 root_dentry = d_alloc_root(inode);
342 if (!root_dentry) {
343 iput(inode);
344 err = -ENOMEM;
345 goto fail_close;
346 }
347#if 0
348 /* this does the super kobj at the same time */
349 err = btrfs_sysfs_add_super(tree_root->fs_info);
350 if (err)
351 goto fail_close;
352#endif
353
354 sb->s_root = root_dentry;
355
356 save_mount_options(sb, data);
357 return 0;
358
359fail_close:
360 close_ctree(tree_root);
361 return err;
362}
363
364int btrfs_sync_fs(struct super_block *sb, int wait)
365{
366 struct btrfs_trans_handle *trans;
367 struct btrfs_root *root;
368 int ret;
369 root = btrfs_sb(sb);
370
371 if (sb->s_flags & MS_RDONLY)
372 return 0;
373
374 sb->s_dirt = 0;
375 if (!wait) {
376 filemap_flush(root->fs_info->btree_inode->i_mapping);
377 return 0;
378 }
379
380 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0);
382
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0;
387 return ret;
388}
389
390static void btrfs_write_super(struct super_block *sb)
391{
392 sb->s_dirt = 0;
393}
394
395static int btrfs_test_super(struct super_block *s, void *data)
396{
397 struct btrfs_fs_devices *test_fs_devices = data;
398 struct btrfs_root *root = btrfs_sb(s);
399
400 return root->fs_info->fs_devices == test_fs_devices;
401}
402
403/*
404 * Find a superblock for the given device / mount point.
405 *
406 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
407 * for multiple device setup. Make sure to keep it in sync.
408 */
409static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
410 const char *dev_name, void *data, struct vfsmount *mnt)
411{
412 char *subvol_name = NULL;
413 struct block_device *bdev = NULL;
414 struct super_block *s;
415 struct dentry *root;
416 struct btrfs_fs_devices *fs_devices = NULL;
417 fmode_t mode = FMODE_READ;
418 int error = 0;
419
420 if (!(flags & MS_RDONLY))
421 mode |= FMODE_WRITE;
422
423 error = btrfs_parse_early_options(data, mode, fs_type,
424 &subvol_name, &fs_devices);
425 if (error)
426 return error;
427
428 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
429 if (error)
430 goto error_free_subvol_name;
431
432 error = btrfs_open_devices(fs_devices, mode, fs_type);
433 if (error)
434 goto error_free_subvol_name;
435
436 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
437 error = -EACCES;
438 goto error_close_devices;
439 }
440
441 bdev = fs_devices->latest_bdev;
442 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
443 if (IS_ERR(s))
444 goto error_s;
445
446 if (s->s_root) {
447 if ((flags ^ s->s_flags) & MS_RDONLY) {
448 up_write(&s->s_umount);
449 deactivate_super(s);
450 error = -EBUSY;
451 goto error_close_devices;
452 }
453
454 btrfs_close_devices(fs_devices);
455 } else {
456 char b[BDEVNAME_SIZE];
457
458 s->s_flags = flags;
459 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
460 error = btrfs_fill_super(s, fs_devices, data,
461 flags & MS_SILENT ? 1 : 0);
462 if (error) {
463 up_write(&s->s_umount);
464 deactivate_super(s);
465 goto error_free_subvol_name;
466 }
467
468 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
469 s->s_flags |= MS_ACTIVE;
470 }
471
472 if (!strcmp(subvol_name, "."))
473 root = dget(s->s_root);
474 else {
475 mutex_lock(&s->s_root->d_inode->i_mutex);
476 root = lookup_one_len(subvol_name, s->s_root,
477 strlen(subvol_name));
478 mutex_unlock(&s->s_root->d_inode->i_mutex);
479
480 if (IS_ERR(root)) {
481 up_write(&s->s_umount);
482 deactivate_super(s);
483 error = PTR_ERR(root);
484 goto error_free_subvol_name;
485 }
486 if (!root->d_inode) {
487 dput(root);
488 up_write(&s->s_umount);
489 deactivate_super(s);
490 error = -ENXIO;
491 goto error_free_subvol_name;
492 }
493 }
494
495 mnt->mnt_sb = s;
496 mnt->mnt_root = root;
497
498 kfree(subvol_name);
499 return 0;
500
501error_s:
502 error = PTR_ERR(s);
503error_close_devices:
504 btrfs_close_devices(fs_devices);
505error_free_subvol_name:
506 kfree(subvol_name);
507 return error;
508}
509
510static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511{
512 struct btrfs_root *root = btrfs_sb(sb);
513 int ret;
514
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0;
517
518 if (*flags & MS_RDONLY) {
519 sb->s_flags |= MS_RDONLY;
520
521 ret = btrfs_commit_super(root);
522 WARN_ON(ret);
523 } else {
524 if (root->fs_info->fs_devices->rw_devices == 0)
525 return -EACCES;
526
527 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
528 return -EINVAL;
529
530 ret = btrfs_cleanup_reloc_trees(root);
531 WARN_ON(ret);
532
533 ret = btrfs_cleanup_fs_roots(root->fs_info);
534 WARN_ON(ret);
535
536 sb->s_flags &= ~MS_RDONLY;
537 }
538
539 return 0;
540}
541
542static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
543{
544 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
545 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
546 int bits = dentry->d_sb->s_blocksize_bits;
547 __be32 *fsid = (__be32 *)root->fs_info->fsid;
548
549 buf->f_namelen = BTRFS_NAME_LEN;
550 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
551 buf->f_bfree = buf->f_blocks -
552 (btrfs_super_bytes_used(disk_super) >> bits);
553 buf->f_bavail = buf->f_bfree;
554 buf->f_bsize = dentry->d_sb->s_blocksize;
555 buf->f_type = BTRFS_SUPER_MAGIC;
556
557 /* We treat it as constant endianness (it doesn't matter _which_)
558 because we want the fsid to come out the same whether mounted
559 on a big-endian or little-endian host */
560 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
561 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
562 /* Mask in the root object ID too, to disambiguate subvols */
563 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
564 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
565
566 return 0;
567}
568
569static struct file_system_type btrfs_fs_type = {
570 .owner = THIS_MODULE,
571 .name = "btrfs",
572 .get_sb = btrfs_get_sb,
573 .kill_sb = kill_anon_super,
574 .fs_flags = FS_REQUIRES_DEV,
575};
576
577/*
578 * used by btrfsctl to scan devices when no FS is mounted
579 */
580static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
581 unsigned long arg)
582{
583 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices;
585 int ret = 0;
586 int len;
587
588 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM;
590
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT;
594 goto out;
595 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 switch (cmd) {
598 case BTRFS_IOC_SCAN_DEV:
599 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
600 &btrfs_fs_type, &fs_devices);
601 break;
602 }
603out:
604 kfree(vol);
605 return ret;
606}
607
608static int btrfs_freeze(struct super_block *sb)
609{
610 struct btrfs_root *root = btrfs_sb(sb);
611 mutex_lock(&root->fs_info->transaction_kthread_mutex);
612 mutex_lock(&root->fs_info->cleaner_mutex);
613 return 0;
614}
615
616static int btrfs_unfreeze(struct super_block *sb)
617{
618 struct btrfs_root *root = btrfs_sb(sb);
619 mutex_unlock(&root->fs_info->cleaner_mutex);
620 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
621 return 0;
622}
623
624static struct super_operations btrfs_super_ops = {
625 .delete_inode = btrfs_delete_inode,
626 .put_super = btrfs_put_super,
627 .write_super = btrfs_write_super,
628 .sync_fs = btrfs_sync_fs,
629 .show_options = generic_show_options,
630 .write_inode = btrfs_write_inode,
631 .dirty_inode = btrfs_dirty_inode,
632 .alloc_inode = btrfs_alloc_inode,
633 .destroy_inode = btrfs_destroy_inode,
634 .statfs = btrfs_statfs,
635 .remount_fs = btrfs_remount,
636 .freeze_fs = btrfs_freeze,
637 .unfreeze_fs = btrfs_unfreeze,
638};
639
640static const struct file_operations btrfs_ctl_fops = {
641 .unlocked_ioctl = btrfs_control_ioctl,
642 .compat_ioctl = btrfs_control_ioctl,
643 .owner = THIS_MODULE,
644};
645
646static struct miscdevice btrfs_misc = {
647 .minor = MISC_DYNAMIC_MINOR,
648 .name = "btrfs-control",
649 .fops = &btrfs_ctl_fops
650};
651
652static int btrfs_interface_init(void)
653{
654 return misc_register(&btrfs_misc);
655}
656
657static void btrfs_interface_exit(void)
658{
659 if (misc_deregister(&btrfs_misc) < 0)
660 printk(KERN_INFO "misc_deregister failed for control device");
661}
662
663static int __init init_btrfs_fs(void)
664{
665 int err;
666
667 err = btrfs_init_sysfs();
668 if (err)
669 return err;
670
671 err = btrfs_init_cachep();
672 if (err)
673 goto free_sysfs;
674
675 err = extent_io_init();
676 if (err)
677 goto free_cachep;
678
679 err = extent_map_init();
680 if (err)
681 goto free_extent_io;
682
683 err = btrfs_interface_init();
684 if (err)
685 goto free_extent_map;
686
687 err = register_filesystem(&btrfs_fs_type);
688 if (err)
689 goto unregister_ioctl;
690
691 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
692 return 0;
693
694unregister_ioctl:
695 btrfs_interface_exit();
696free_extent_map:
697 extent_map_exit();
698free_extent_io:
699 extent_io_exit();
700free_cachep:
701 btrfs_destroy_cachep();
702free_sysfs:
703 btrfs_exit_sysfs();
704 return err;
705}
706
707static void __exit exit_btrfs_fs(void)
708{
709 btrfs_destroy_cachep();
710 extent_map_exit();
711 extent_io_exit();
712 btrfs_interface_exit();
713 unregister_filesystem(&btrfs_fs_type);
714 btrfs_exit_sysfs();
715 btrfs_cleanup_fs_uuids();
716 btrfs_zlib_exit();
717}
718
719module_init(init_btrfs_fs)
720module_exit(exit_btrfs_fs)
721
722MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset;
191
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
206 for (i = 0; i < len; i++) {
207 c = fs->sb->s_id[i];
208 if (c == '/' || c == '\\')
209 c = '!';
210 name[i] = c;
211 }
212 name[len] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..8a08f9443340
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
31#define BTRFS_ROOT_TRANS_TAG 0
32
33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{
35 WARN_ON(transaction->use_count == 0);
36 transaction->use_count--;
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 }
42}
43
44/*
45 * either allocate a new transaction or hop into the existing one
46 */
47static noinline int join_transaction(struct btrfs_root *root)
48{
49 struct btrfs_transaction *cur_trans;
50 cur_trans = root->fs_info->running_transaction;
51 if (!cur_trans) {
52 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53 GFP_NOFS);
54 BUG_ON(!cur_trans);
55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation;
61 init_waitqueue_head(&cur_trans->writer_wait);
62 init_waitqueue_head(&cur_trans->commit_wait);
63 cur_trans->in_commit = 0;
64 cur_trans->blocked = 0;
65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds();
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages,
71 root->fs_info->btree_inode->i_mapping,
72 GFP_NOFS);
73 spin_lock(&root->fs_info->new_trans_lock);
74 root->fs_info->running_transaction = cur_trans;
75 spin_unlock(&root->fs_info->new_trans_lock);
76 } else {
77 cur_trans->num_writers++;
78 cur_trans->num_joined++;
79 }
80
81 return 0;
82}
83
84/*
85 * this does all the record keeping required to make sure that a reference
86 * counted root is properly recorded in a given transaction. This is required
87 * to make sure the old root from before we joined the transaction is deleted
88 * when the transaction commits
89 */
90noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
91{
92 struct btrfs_dirty_root *dirty;
93 u64 running_trans_id = root->fs_info->running_transaction->transid;
94 if (root->ref_cows && root->last_trans < running_trans_id) {
95 WARN_ON(root == root->fs_info->extent_root);
96 if (root->root_item.refs != 0) {
97 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
98 (unsigned long)root->root_key.objectid,
99 BTRFS_ROOT_TRANS_TAG);
100
101 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
102 BUG_ON(!dirty);
103 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
104 BUG_ON(!dirty->root);
105 dirty->latest_root = root;
106 INIT_LIST_HEAD(&dirty->list);
107
108 root->commit_root = btrfs_root_node(root);
109
110 memcpy(dirty->root, root, sizeof(*root));
111 spin_lock_init(&dirty->root->node_lock);
112 spin_lock_init(&dirty->root->list_lock);
113 mutex_init(&dirty->root->objectid_mutex);
114 mutex_init(&dirty->root->log_mutex);
115 INIT_LIST_HEAD(&dirty->root->dead_list);
116 dirty->root->node = root->commit_root;
117 dirty->root->commit_root = NULL;
118
119 spin_lock(&root->list_lock);
120 list_add(&dirty->root->dead_list, &root->dead_list);
121 spin_unlock(&root->list_lock);
122
123 root->dirty_root = dirty;
124 } else {
125 WARN_ON(1);
126 }
127 root->last_trans = running_trans_id;
128 }
129 return 0;
130}
131
132/* wait for commit against the current transaction to become unblocked
133 * when this is done, it is safe to start a new transaction, but the current
134 * transaction might not be fully on disk.
135 */
136static void wait_current_trans(struct btrfs_root *root)
137{
138 struct btrfs_transaction *cur_trans;
139
140 cur_trans = root->fs_info->running_transaction;
141 if (cur_trans && cur_trans->blocked) {
142 DEFINE_WAIT(wait);
143 cur_trans->use_count++;
144 while (1) {
145 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
146 TASK_UNINTERRUPTIBLE);
147 if (cur_trans->blocked) {
148 mutex_unlock(&root->fs_info->trans_mutex);
149 schedule();
150 mutex_lock(&root->fs_info->trans_mutex);
151 finish_wait(&root->fs_info->transaction_wait,
152 &wait);
153 } else {
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 break;
157 }
158 }
159 put_transaction(cur_trans);
160 }
161}
162
163static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
164 int num_blocks, int wait)
165{
166 struct btrfs_trans_handle *h =
167 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
168 int ret;
169
170 mutex_lock(&root->fs_info->trans_mutex);
171 if (!root->fs_info->log_root_recovering &&
172 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
173 wait_current_trans(root);
174 ret = join_transaction(root);
175 BUG_ON(ret);
176
177 btrfs_record_root_in_trans(root);
178 h->transid = root->fs_info->running_transaction->transid;
179 h->transaction = root->fs_info->running_transaction;
180 h->blocks_reserved = num_blocks;
181 h->blocks_used = 0;
182 h->block_group = 0;
183 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0;
185 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex);
187 return h;
188}
189
190struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
191 int num_blocks)
192{
193 return start_transaction(root, num_blocks, 1);
194}
195struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
196 int num_blocks)
197{
198 return start_transaction(root, num_blocks, 0);
199}
200
201struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
202 int num_blocks)
203{
204 return start_transaction(r, num_blocks, 2);
205}
206
207/* wait for a transaction commit to be fully complete */
208static noinline int wait_for_commit(struct btrfs_root *root,
209 struct btrfs_transaction *commit)
210{
211 DEFINE_WAIT(wait);
212 mutex_lock(&root->fs_info->trans_mutex);
213 while (!commit->commit_done) {
214 prepare_to_wait(&commit->commit_wait, &wait,
215 TASK_UNINTERRUPTIBLE);
216 if (commit->commit_done)
217 break;
218 mutex_unlock(&root->fs_info->trans_mutex);
219 schedule();
220 mutex_lock(&root->fs_info->trans_mutex);
221 }
222 mutex_unlock(&root->fs_info->trans_mutex);
223 finish_wait(&commit->commit_wait, &wait);
224 return 0;
225}
226
227/*
228 * rate limit against the drop_snapshot code. This helps to slow down new
229 * operations if the drop_snapshot code isn't able to keep up.
230 */
231static void throttle_on_drops(struct btrfs_root *root)
232{
233 struct btrfs_fs_info *info = root->fs_info;
234 int harder_count = 0;
235
236harder:
237 if (atomic_read(&info->throttles)) {
238 DEFINE_WAIT(wait);
239 int thr;
240 thr = atomic_read(&info->throttle_gen);
241
242 do {
243 prepare_to_wait(&info->transaction_throttle,
244 &wait, TASK_UNINTERRUPTIBLE);
245 if (!atomic_read(&info->throttles)) {
246 finish_wait(&info->transaction_throttle, &wait);
247 break;
248 }
249 schedule();
250 finish_wait(&info->transaction_throttle, &wait);
251 } while (thr == atomic_read(&info->throttle_gen));
252 harder_count++;
253
254 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
255 harder_count < 2)
256 goto harder;
257
258 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
259 harder_count < 10)
260 goto harder;
261
262 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
263 harder_count < 20)
264 goto harder;
265 }
266}
267
268void btrfs_throttle(struct btrfs_root *root)
269{
270 mutex_lock(&root->fs_info->trans_mutex);
271 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root);
276}
277
278static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
279 struct btrfs_root *root, int throttle)
280{
281 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info;
283
284 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction;
286 WARN_ON(cur_trans != trans->transaction);
287 WARN_ON(cur_trans->num_writers < 1);
288 cur_trans->num_writers--;
289
290 if (waitqueue_active(&cur_trans->writer_wait))
291 wake_up(&cur_trans->writer_wait);
292 put_transaction(cur_trans);
293 mutex_unlock(&info->trans_mutex);
294 memset(trans, 0, sizeof(*trans));
295 kmem_cache_free(btrfs_trans_handle_cachep, trans);
296
297 if (throttle)
298 throttle_on_drops(root);
299
300 return 0;
301}
302
303int btrfs_end_transaction(struct btrfs_trans_handle *trans,
304 struct btrfs_root *root)
305{
306 return __btrfs_end_transaction(trans, root, 0);
307}
308
309int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
310 struct btrfs_root *root)
311{
312 return __btrfs_end_transaction(trans, root, 1);
313}
314
315/*
316 * when btree blocks are allocated, they have some corresponding bits set for
317 * them in one of two extent_io trees. This is used to make sure all of
318 * those extents are on disk for transaction or log commit
319 */
320int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
321 struct extent_io_tree *dirty_pages)
322{
323 int ret;
324 int err = 0;
325 int werr = 0;
326 struct page *page;
327 struct inode *btree_inode = root->fs_info->btree_inode;
328 u64 start = 0;
329 u64 end;
330 unsigned long index;
331
332 while (1) {
333 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
334 EXTENT_DIRTY);
335 if (ret)
336 break;
337 while (start <= end) {
338 cond_resched();
339
340 index = start >> PAGE_CACHE_SHIFT;
341 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
342 page = find_get_page(btree_inode->i_mapping, index);
343 if (!page)
344 continue;
345
346 btree_lock_page_hook(page);
347 if (!page->mapping) {
348 unlock_page(page);
349 page_cache_release(page);
350 continue;
351 }
352
353 if (PageWriteback(page)) {
354 if (PageDirty(page))
355 wait_on_page_writeback(page);
356 else {
357 unlock_page(page);
358 page_cache_release(page);
359 continue;
360 }
361 }
362 err = write_one_page(page, 0);
363 if (err)
364 werr = err;
365 page_cache_release(page);
366 }
367 }
368 while (1) {
369 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
370 EXTENT_DIRTY);
371 if (ret)
372 break;
373
374 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
375 while (start <= end) {
376 index = start >> PAGE_CACHE_SHIFT;
377 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
378 page = find_get_page(btree_inode->i_mapping, index);
379 if (!page)
380 continue;
381 if (PageDirty(page)) {
382 btree_lock_page_hook(page);
383 wait_on_page_writeback(page);
384 err = write_one_page(page, 0);
385 if (err)
386 werr = err;
387 }
388 wait_on_page_writeback(page);
389 page_cache_release(page);
390 cond_resched();
391 }
392 }
393 if (err)
394 werr = err;
395 return werr;
396}
397
398int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
399 struct btrfs_root *root)
400{
401 if (!trans || !trans->transaction) {
402 struct inode *btree_inode;
403 btree_inode = root->fs_info->btree_inode;
404 return filemap_write_and_wait(btree_inode->i_mapping);
405 }
406 return btrfs_write_and_wait_marked_extents(root,
407 &trans->transaction->dirty_pages);
408}
409
410/*
411 * this is used to update the root pointer in the tree of tree roots.
412 *
413 * But, in the case of the extent allocation tree, updating the root
414 * pointer may allocate blocks which may change the root of the extent
415 * allocation tree.
416 *
417 * So, this loops and repeats and makes sure the cowonly root didn't
418 * change while the root pointer was being updated in the metadata.
419 */
420static int update_cowonly_root(struct btrfs_trans_handle *trans,
421 struct btrfs_root *root)
422{
423 int ret;
424 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root;
426
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root);
430
431 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
433 if (old_root_bytenr == root->node->start)
434 break;
435 btrfs_set_root_bytenr(&root->root_item,
436 root->node->start);
437 btrfs_set_root_level(&root->root_item,
438 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid);
440
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key,
445 &root->root_item);
446 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root);
449 }
450 return 0;
451}
452
453/*
454 * update all the cowonly tree roots on disk
455 */
456int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
457 struct btrfs_root *root)
458{
459 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next;
461 struct extent_buffer *eb;
462
463 btrfs_extent_post_op(trans, fs_info->tree_root);
464
465 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
467 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb);
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next;
474 list_del_init(next);
475 root = list_entry(next, struct btrfs_root, dirty_list);
476
477 update_cowonly_root(trans, root);
478 }
479 return 0;
480}
481
482/*
483 * dead roots are old snapshots that need to be deleted. This allocates
484 * a dirty root struct and adds it into the list of dead roots that need to
485 * be deleted
486 */
487int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
488{
489 struct btrfs_dirty_root *dirty;
490
491 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
492 if (!dirty)
493 return -ENOMEM;
494 dirty->root = root;
495 dirty->latest_root = latest;
496
497 mutex_lock(&root->fs_info->trans_mutex);
498 list_add(&dirty->list, &latest->fs_info->dead_roots);
499 mutex_unlock(&root->fs_info->trans_mutex);
500 return 0;
501}
502
503/*
504 * at transaction commit time we need to schedule the old roots for
505 * deletion via btrfs_drop_snapshot. This runs through all the
506 * reference counted roots that were modified in the current
507 * transaction and puts them into the drop list
508 */
509static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
510 struct radix_tree_root *radix,
511 struct list_head *list)
512{
513 struct btrfs_dirty_root *dirty;
514 struct btrfs_root *gang[8];
515 struct btrfs_root *root;
516 int i;
517 int ret;
518 int err = 0;
519 u32 refs;
520
521 while (1) {
522 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
523 ARRAY_SIZE(gang),
524 BTRFS_ROOT_TRANS_TAG);
525 if (ret == 0)
526 break;
527 for (i = 0; i < ret; i++) {
528 root = gang[i];
529 radix_tree_tag_clear(radix,
530 (unsigned long)root->root_key.objectid,
531 BTRFS_ROOT_TRANS_TAG);
532
533 BUG_ON(!root->ref_tree);
534 dirty = root->dirty_root;
535
536 btrfs_free_log(trans, root);
537 btrfs_free_reloc_root(trans, root);
538
539 if (root->commit_root == root->node) {
540 WARN_ON(root->node->start !=
541 btrfs_root_bytenr(&root->root_item));
542
543 free_extent_buffer(root->commit_root);
544 root->commit_root = NULL;
545 root->dirty_root = NULL;
546
547 spin_lock(&root->list_lock);
548 list_del_init(&dirty->root->dead_list);
549 spin_unlock(&root->list_lock);
550
551 kfree(dirty->root);
552 kfree(dirty);
553
554 /* make sure to update the root on disk
555 * so we get any updates to the block used
556 * counts
557 */
558 err = btrfs_update_root(trans,
559 root->fs_info->tree_root,
560 &root->root_key,
561 &root->root_item);
562 continue;
563 }
564
565 memset(&root->root_item.drop_progress, 0,
566 sizeof(struct btrfs_disk_key));
567 root->root_item.drop_level = 0;
568 root->commit_root = NULL;
569 root->dirty_root = NULL;
570 root->root_key.offset = root->fs_info->generation;
571 btrfs_set_root_bytenr(&root->root_item,
572 root->node->start);
573 btrfs_set_root_level(&root->root_item,
574 btrfs_header_level(root->node));
575 btrfs_set_root_generation(&root->root_item,
576 root->root_key.offset);
577
578 err = btrfs_insert_root(trans, root->fs_info->tree_root,
579 &root->root_key,
580 &root->root_item);
581 if (err)
582 break;
583
584 refs = btrfs_root_refs(&dirty->root->root_item);
585 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
586 err = btrfs_update_root(trans, root->fs_info->tree_root,
587 &dirty->root->root_key,
588 &dirty->root->root_item);
589
590 BUG_ON(err);
591 if (refs == 1) {
592 list_add(&dirty->list, list);
593 } else {
594 WARN_ON(1);
595 free_extent_buffer(dirty->root->node);
596 kfree(dirty->root);
597 kfree(dirty);
598 }
599 }
600 }
601 return err;
602}
603
604/*
605 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
606 * otherwise every leaf in the btree is read and defragged.
607 */
608int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
609{
610 struct btrfs_fs_info *info = root->fs_info;
611 int ret;
612 struct btrfs_trans_handle *trans;
613 unsigned long nr;
614
615 smp_mb();
616 if (root->defrag_running)
617 return 0;
618 trans = btrfs_start_transaction(root, 1);
619 while (1) {
620 root->defrag_running = 1;
621 ret = btrfs_defrag_leaves(trans, root, cacheonly);
622 nr = trans->blocks_used;
623 btrfs_end_transaction(trans, root);
624 btrfs_btree_balance_dirty(info->tree_root, nr);
625 cond_resched();
626
627 trans = btrfs_start_transaction(root, 1);
628 if (root->fs_info->closing || ret != -EAGAIN)
629 break;
630 }
631 root->defrag_running = 0;
632 smp_mb();
633 btrfs_end_transaction(trans, root);
634 return 0;
635}
636
637/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them
640 */
641static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
642 struct list_head *list)
643{
644 struct btrfs_dirty_root *dirty;
645 struct btrfs_trans_handle *trans;
646 unsigned long nr;
647 u64 num_bytes;
648 u64 bytes_used;
649 u64 max_useless;
650 int ret = 0;
651 int err;
652
653 while (!list_empty(list)) {
654 struct btrfs_root *root;
655
656 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
657 list_del_init(&dirty->list);
658
659 num_bytes = btrfs_root_used(&dirty->root->root_item);
660 root = dirty->latest_root;
661 atomic_inc(&root->fs_info->throttles);
662
663 while (1) {
664 trans = btrfs_start_transaction(tree_root, 1);
665 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN)
668 break;
669 mutex_unlock(&root->fs_info->drop_mutex);
670
671 err = btrfs_update_root(trans,
672 tree_root,
673 &dirty->root->root_key,
674 &dirty->root->root_item);
675 if (err)
676 ret = err;
677 nr = trans->blocks_used;
678 ret = btrfs_end_transaction(trans, tree_root);
679 BUG_ON(ret);
680
681 btrfs_btree_balance_dirty(tree_root, nr);
682 cond_resched();
683 }
684 BUG_ON(ret);
685 atomic_dec(&root->fs_info->throttles);
686 wake_up(&root->fs_info->transaction_throttle);
687
688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) {
691 btrfs_record_root_in_trans(root);
692 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes);
694 }
695
696 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
697 if (ret) {
698 BUG();
699 break;
700 }
701 mutex_unlock(&root->fs_info->drop_mutex);
702
703 spin_lock(&root->list_lock);
704 list_del_init(&dirty->root->dead_list);
705 if (!list_empty(&root->dead_list)) {
706 struct btrfs_root *oldest;
707 oldest = list_entry(root->dead_list.prev,
708 struct btrfs_root, dead_list);
709 max_useless = oldest->root_key.offset - 1;
710 } else {
711 max_useless = root->root_key.offset - 1;
712 }
713 spin_unlock(&root->list_lock);
714
715 nr = trans->blocks_used;
716 ret = btrfs_end_transaction(trans, tree_root);
717 BUG_ON(ret);
718
719 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
720 BUG_ON(ret);
721
722 free_extent_buffer(dirty->root->node);
723 kfree(dirty->root);
724 kfree(dirty);
725
726 btrfs_btree_balance_dirty(tree_root, nr);
727 cond_resched();
728 }
729 return ret;
730}
731
732/*
733 * new snapshots need to be created at a very specific time in the
734 * transaction commit. This does the actual creation
735 */
736static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
737 struct btrfs_fs_info *fs_info,
738 struct btrfs_pending_snapshot *pending)
739{
740 struct btrfs_key key;
741 struct btrfs_root_item *new_root_item;
742 struct btrfs_root *tree_root = fs_info->tree_root;
743 struct btrfs_root *root = pending->root;
744 struct extent_buffer *tmp;
745 struct extent_buffer *old;
746 int ret;
747 u64 objectid;
748
749 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
750 if (!new_root_item) {
751 ret = -ENOMEM;
752 goto fail;
753 }
754 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
755 if (ret)
756 goto fail;
757
758 btrfs_record_root_in_trans(root);
759 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
760 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
761
762 key.objectid = objectid;
763 key.offset = trans->transid;
764 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
765
766 old = btrfs_lock_root_node(root);
767 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
768
769 btrfs_copy_root(trans, root, old, &tmp, objectid);
770 btrfs_tree_unlock(old);
771 free_extent_buffer(old);
772
773 btrfs_set_root_bytenr(new_root_item, tmp->start);
774 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
775 btrfs_set_root_generation(new_root_item, trans->transid);
776 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
777 new_root_item);
778 btrfs_tree_unlock(tmp);
779 free_extent_buffer(tmp);
780 if (ret)
781 goto fail;
782
783 key.offset = (u64)-1;
784 memcpy(&pending->root_key, &key, sizeof(key));
785fail:
786 kfree(new_root_item);
787 return ret;
788}
789
790static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
791 struct btrfs_pending_snapshot *pending)
792{
793 int ret;
794 int namelen;
795 u64 index = 0;
796 struct btrfs_trans_handle *trans;
797 struct inode *parent_inode;
798 struct inode *inode;
799 struct btrfs_root *parent_root;
800
801 parent_inode = pending->dentry->d_parent->d_inode;
802 parent_root = BTRFS_I(parent_inode)->root;
803 trans = btrfs_join_transaction(parent_root, 1);
804
805 /*
806 * insert the directory item
807 */
808 namelen = strlen(pending->name);
809 ret = btrfs_set_inode_index(parent_inode, &index);
810 ret = btrfs_insert_dir_item(trans, parent_root,
811 pending->name, namelen,
812 parent_inode->i_ino,
813 &pending->root_key, BTRFS_FT_DIR, index);
814
815 if (ret)
816 goto fail;
817
818 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
819 ret = btrfs_update_inode(trans, parent_root, parent_inode);
820 BUG_ON(ret);
821
822 /* add the backref first */
823 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
824 pending->root_key.objectid,
825 BTRFS_ROOT_BACKREF_KEY,
826 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name,
828 namelen);
829
830 BUG_ON(ret);
831
832 /* now add the forward ref */
833 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
834 parent_root->root_key.objectid,
835 BTRFS_ROOT_REF_KEY,
836 pending->root_key.objectid,
837 parent_inode->i_ino, index, pending->name,
838 namelen);
839
840 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
841 d_instantiate(pending->dentry, inode);
842fail:
843 btrfs_end_transaction(trans, fs_info->fs_root);
844 return ret;
845}
846
847/*
848 * create all the snapshots we've scheduled for creation
849 */
850static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
851 struct btrfs_fs_info *fs_info)
852{
853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret;
857
858 list_for_each(cur, head) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret);
862 }
863 return 0;
864}
865
866static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
867 struct btrfs_fs_info *fs_info)
868{
869 struct btrfs_pending_snapshot *pending;
870 struct list_head *head = &trans->transaction->pending_snapshots;
871 int ret;
872
873 while (!list_empty(head)) {
874 pending = list_entry(head->next,
875 struct btrfs_pending_snapshot, list);
876 ret = finish_pending_snapshot(fs_info, pending);
877 BUG_ON(ret);
878 list_del(&pending->list);
879 kfree(pending->name);
880 kfree(pending);
881 }
882 return 0;
883}
884
885int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
886 struct btrfs_root *root)
887{
888 unsigned long joined = 0;
889 unsigned long timeout = 1;
890 struct btrfs_transaction *cur_trans;
891 struct btrfs_transaction *prev_trans = NULL;
892 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
893 struct list_head dirty_fs_roots;
894 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait);
896 int ret;
897
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) {
901 cur_trans = trans->transaction;
902 trans->transaction->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root);
905
906 ret = wait_for_commit(root, cur_trans);
907 BUG_ON(ret);
908
909 mutex_lock(&root->fs_info->trans_mutex);
910 put_transaction(cur_trans);
911 mutex_unlock(&root->fs_info->trans_mutex);
912
913 return 0;
914 }
915
916 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
917 if (!pinned_copy)
918 return -ENOMEM;
919
920 extent_io_tree_init(pinned_copy,
921 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
922
923 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list);
929 if (!prev_trans->commit_done) {
930 prev_trans->use_count++;
931 mutex_unlock(&root->fs_info->trans_mutex);
932
933 wait_for_commit(root, prev_trans);
934
935 mutex_lock(&root->fs_info->trans_mutex);
936 put_transaction(prev_trans);
937 }
938 }
939
940 do {
941 int snap_pending = 0;
942 joined = cur_trans->num_joined;
943 if (!list_empty(&trans->transaction->pending_snapshots))
944 snap_pending = 1;
945
946 WARN_ON(cur_trans != trans->transaction);
947 prepare_to_wait(&cur_trans->writer_wait, &wait,
948 TASK_UNINTERRUPTIBLE);
949
950 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT;
952 else
953 timeout = 1;
954
955 mutex_unlock(&root->fs_info->trans_mutex);
956
957 if (snap_pending) {
958 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret);
960 }
961
962 schedule_timeout(timeout);
963
964 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined));
968
969 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret);
971
972 WARN_ON(cur_trans != trans->transaction);
973
974 /* btrfs_commit_tree_roots is responsible for getting the
975 * various roots consistent with each other. Every pointer
976 * in the tree of tree roots has to point to the most up to date
977 * root for every subvolume and other tree. So, we have to keep
978 * the tree logging code from jumping in and changing any
979 * of the trees.
980 *
981 * At this point in the commit, there can't be any tree-log
982 * writers, but a little lower down we drop the trans mutex
983 * and let new people in. By holding the tree_log_mutex
984 * from now until after the super is written, we avoid races
985 * with the tree-log code.
986 */
987 mutex_lock(&root->fs_info->tree_log_mutex);
988 /*
989 * keep tree reloc code from adding new reloc trees
990 */
991 mutex_lock(&root->fs_info->tree_reloc_mutex);
992
993
994 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
995 &dirty_fs_roots);
996 BUG_ON(ret);
997
998 /* add_dirty_roots gets rid of all the tree log roots, it is now
999 * safe to free the root of tree log roots
1000 */
1001 btrfs_free_log_root_tree(trans, root->fs_info);
1002
1003 ret = btrfs_commit_tree_roots(trans, root);
1004 BUG_ON(ret);
1005
1006 cur_trans = root->fs_info->running_transaction;
1007 spin_lock(&root->fs_info->new_trans_lock);
1008 root->fs_info->running_transaction = NULL;
1009 spin_unlock(&root->fs_info->new_trans_lock);
1010 btrfs_set_super_generation(&root->fs_info->super_copy,
1011 cur_trans->transid);
1012 btrfs_set_super_root(&root->fs_info->super_copy,
1013 root->fs_info->tree_root->node->start);
1014 btrfs_set_super_root_level(&root->fs_info->super_copy,
1015 btrfs_header_level(root->fs_info->tree_root->node));
1016
1017 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1018 chunk_root->node->start);
1019 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1020 btrfs_header_level(chunk_root->node));
1021 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1022 btrfs_header_generation(chunk_root->node));
1023
1024 if (!root->fs_info->log_root_recovering) {
1025 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1026 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1027 }
1028
1029 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1030 sizeof(root->fs_info->super_copy));
1031
1032 btrfs_copy_pinned(root, pinned_copy);
1033
1034 trans->transaction->blocked = 0;
1035 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait);
1037
1038 mutex_unlock(&root->fs_info->trans_mutex);
1039 ret = btrfs_write_and_wait_transaction(trans, root);
1040 BUG_ON(ret);
1041 write_ctree_super(trans, root, 0);
1042
1043 /*
1044 * the super is written, we can safely allow the tree-loggers
1045 * to go about their business
1046 */
1047 mutex_unlock(&root->fs_info->tree_log_mutex);
1048
1049 btrfs_finish_extent_commit(trans, root, pinned_copy);
1050 kfree(pinned_copy);
1051
1052 btrfs_drop_dead_reloc_roots(root);
1053 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1054
1055 /* do the directory inserts of any pending snapshot creations */
1056 finish_pending_snapshots(trans, root->fs_info);
1057
1058 mutex_lock(&root->fs_info->trans_mutex);
1059
1060 cur_trans->commit_done = 1;
1061 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait);
1063
1064 put_transaction(cur_trans);
1065 put_transaction(cur_trans);
1066
1067 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1068 if (root->fs_info->closing)
1069 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072
1073 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1074
1075 if (root->fs_info->closing)
1076 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1077 return ret;
1078}
1079
1080/*
1081 * interface function to delete all the snapshots we have scheduled for deletion
1082 */
1083int btrfs_clean_old_snapshots(struct btrfs_root *root)
1084{
1085 struct list_head dirty_roots;
1086 INIT_LIST_HEAD(&dirty_roots);
1087again:
1088 mutex_lock(&root->fs_info->trans_mutex);
1089 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1090 mutex_unlock(&root->fs_info->trans_mutex);
1091
1092 if (!list_empty(&dirty_roots)) {
1093 drop_dirty_roots(root, &dirty_roots);
1094 goto again;
1095 }
1096 return 0;
1097}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 u64 block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct dentry *dentry;
51 struct btrfs_root *root;
52 char *name;
53 struct btrfs_key root_key;
54 struct list_head list;
55};
56
57struct btrfs_dirty_root {
58 struct list_head list;
59 struct btrfs_root *root;
60 struct btrfs_root *latest_root;
61};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(
70 struct btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
82int btrfs_end_transaction(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root);
84struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
87 int num_blocks);
88struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
89 int num_blocks);
90int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
93 struct btrfs_root *root);
94
95int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
96int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
97int btrfs_clean_old_snapshots(struct btrfs_root *root);
98int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root);
102void btrfs_throttle(struct btrfs_root *root);
103int btrfs_record_root_in_trans(struct btrfs_root *root);
104int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
105 struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only)
33{
34 struct btrfs_path *path = NULL;
35 struct btrfs_key key;
36 int ret = 0;
37 int wret;
38 int level;
39 int orig_level;
40 int is_extent = 0;
41 int next_key_ret = 0;
42 u64 last_ret = 0;
43 u64 min_trans = 0;
44
45 if (cache_only)
46 goto out;
47
48 if (root->fs_info->extent_root == root) {
49 /*
50 * there's recursion here right now in the tree locking,
51 * we can't defrag the extent root without deadlock
52 */
53 goto out;
54 }
55
56 if (root->ref_cows == 0 && !is_extent)
57 goto out;
58
59 if (btrfs_test_opt(root, SSD))
60 goto out;
61
62 path = btrfs_alloc_path();
63 if (!path)
64 return -ENOMEM;
65
66 level = btrfs_header_level(root->node);
67 orig_level = level;
68
69 if (level == 0)
70 goto out;
71
72 if (root->defrag_progress.objectid == 0) {
73 struct extent_buffer *root_node;
74 u32 nritems;
75
76 root_node = btrfs_lock_root_node(root);
77 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */
80 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
81 nritems - 1);
82 btrfs_tree_unlock(root_node);
83 free_extent_buffer(root_node);
84 memset(&key, 0, sizeof(key));
85 } else {
86 memcpy(&key, &root->defrag_progress, sizeof(key));
87 }
88
89 path->keep_locks = 1;
90 if (cache_only)
91 min_trans = root->defrag_trans_start;
92
93 ret = btrfs_search_forward(root, &key, NULL, path,
94 cache_only, min_trans);
95 if (ret < 0)
96 goto out;
97 if (ret > 0) {
98 ret = 0;
99 goto out;
100 }
101 btrfs_release_path(root, path);
102 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
103
104 if (wret < 0) {
105 ret = wret;
106 goto out;
107 }
108 if (!path->nodes[1]) {
109 ret = 0;
110 goto out;
111 }
112 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
113 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
114 min_trans);
115 ret = btrfs_realloc_node(trans, root,
116 path->nodes[1], 0,
117 cache_only, &last_ret,
118 &root->defrag_progress);
119 WARN_ON(ret && ret != -EAGAIN);
120 if (next_key_ret == 0) {
121 memcpy(&root->defrag_progress, &key, sizeof(key));
122 ret = -EAGAIN;
123 }
124
125 btrfs_release_path(root, path);
126 if (is_extent)
127 btrfs_extent_post_op(trans, root);
128out:
129 if (path)
130 btrfs_free_path(path);
131 if (ret == -EAGAIN) {
132 if (root->defrag_max.objectid > root->defrag_progress.objectid)
133 goto done;
134 if (root->defrag_max.type > root->defrag_progress.type)
135 goto done;
136 if (root->defrag_max.offset > root->defrag_progress.offset)
137 goto done;
138 ret = 0;
139 }
140done:
141 if (ret != -EAGAIN) {
142 memset(&root->defrag_progress, 0,
143 sizeof(root->defrag_progress));
144 root->defrag_trans_start = trans->transid;
145 }
146 return ret;
147}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d81cda2e077c
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26#include "tree-log.h"
27
28/* magic values for the inode_only field in btrfs_log_inode:
29 *
30 * LOG_INODE_ALL means to log everything
31 * LOG_INODE_EXISTS means to log just enough to recreate the inode
32 * during log replay
33 */
34#define LOG_INODE_ALL 0
35#define LOG_INODE_EXISTS 1
36
37/*
38 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes
41 * we find in the log are created in the subvolume.
42 *
43 * The last stage is to deal with directories and links and extents
44 * and all the other fun semantics
45 */
46#define LOG_WALK_PIN_ONLY 0
47#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2
49
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode,
52 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid);
56
57/*
58 * tree logging is a special write ahead log used to make sure that
59 * fsyncs and O_SYNCs can happen without doing full tree commits.
60 *
61 * Full tree commits are expensive because they require commonly
62 * modified blocks to be recowed, creating many dirty pages in the
63 * extent tree an 4x-6x higher write load than ext3.
64 *
65 * Instead of doing a tree commit on every fsync, we use the
66 * key ranges and transaction ids to find items for a given file or directory
67 * that have changed in this transaction. Those items are copied into
68 * a special tree (one per subvolume root), that tree is written to disk
69 * and then the fsync is considered complete.
70 *
71 * After a crash, items are copied out of the log-tree back into the
72 * subvolume tree. Any file data extents found are recorded in the extent
73 * allocation tree, and the log-tree freed.
74 *
75 * The log tree is read three times, once to pin down all the extents it is
76 * using in ram and once, once to create all the inodes logged in the tree
77 * and once to do all the other items.
78 */
79
80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish
182 */
183static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root)
185{
186 int ret;
187 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info);
190 BUG_ON(ret);
191 }
192 if (!root->log_root) {
193 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret);
195 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex);
199 return 0;
200}
201
202/*
203 * returns 0 if there was a log transaction running and we were able
204 * to join, or returns -ENOENT if there were not transactions
205 * in progress
206 */
207static int join_running_log_trans(struct btrfs_root *root)
208{
209 int ret = -ENOENT;
210
211 smp_mb();
212 if (!root->log_root)
213 return -ENOENT;
214
215 mutex_lock(&root->fs_info->tree_log_mutex);
216 if (root->log_root) {
217 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers);
219 root->fs_info->tree_log_batch++;
220 }
221 mutex_unlock(&root->fs_info->tree_log_mutex);
222 return ret;
223}
224
225/*
226 * indicate we're done making changes to the log tree
227 * and wake up anyone waiting to do a sync
228 */
229static int end_log_trans(struct btrfs_root *root)
230{
231 atomic_dec(&root->fs_info->tree_log_writers);
232 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait))
234 wake_up(&root->fs_info->tree_log_wait);
235 return 0;
236}
237
238
239/*
240 * the walk control struct is used to pass state down the chain when
241 * processing the log tree. The stage field tells us which part
242 * of the log tree processing we are currently doing. The others
243 * are state fields used for that specific part
244 */
245struct walk_control {
246 /* should we free the extent on disk when done? This is used
247 * at transaction commit time while freeing a log tree
248 */
249 int free;
250
251 /* should we write out the extent buffer? This is used
252 * while flushing the log tree to disk during a sync
253 */
254 int write;
255
256 /* should we wait for the extent buffer io to finish? Also used
257 * while flushing the log tree to disk for a sync
258 */
259 int wait;
260
261 /* pin only walk, we record which extents on disk belong to the
262 * log trees
263 */
264 int pin;
265
266 /* what stage of the replay code we're currently in */
267 int stage;
268
269 /* the root we are currently replaying */
270 struct btrfs_root *replay_dest;
271
272 /* the trans handle for the current replay */
273 struct btrfs_trans_handle *trans;
274
275 /* the function that gets used to process blocks we find in the
276 * tree. Note the extent_buffer might not be up to date when it is
277 * passed in, and it must be checked or read if you need the data
278 * inside it
279 */
280 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
281 struct walk_control *wc, u64 gen);
282};
283
284/*
285 * process_func used to pin down extents, write them or wait on them
286 */
287static int process_one_buffer(struct btrfs_root *log,
288 struct extent_buffer *eb,
289 struct walk_control *wc, u64 gen)
290{
291 if (wc->pin) {
292 mutex_lock(&log->fs_info->pinned_mutex);
293 btrfs_update_pinned_extents(log->fs_info->extent_root,
294 eb->start, eb->len, 1);
295 mutex_unlock(&log->fs_info->pinned_mutex);
296 }
297
298 if (btrfs_buffer_uptodate(eb, gen)) {
299 if (wc->write)
300 btrfs_write_tree_block(eb);
301 if (wc->wait)
302 btrfs_wait_tree_block_writeback(eb);
303 }
304 return 0;
305}
306
307/*
308 * Item overwrite used by replay and tree logging. eb, slot and key all refer
309 * to the src data we are copying out.
310 *
311 * root is the tree we are copying into, and path is a scratch
312 * path for use in this function (it should be released on entry and
313 * will be released on exit).
314 *
315 * If the key is already in the destination tree the existing item is
316 * overwritten. If the existing item isn't big enough, it is extended.
317 * If it is too large, it is truncated.
318 *
319 * If the key isn't in the destination yet, a new item is inserted.
320 */
321static noinline int overwrite_item(struct btrfs_trans_handle *trans,
322 struct btrfs_root *root,
323 struct btrfs_path *path,
324 struct extent_buffer *eb, int slot,
325 struct btrfs_key *key)
326{
327 int ret;
328 u32 item_size;
329 u64 saved_i_size = 0;
330 int save_old_i_size = 0;
331 unsigned long src_ptr;
332 unsigned long dst_ptr;
333 int overwrite_root = 0;
334
335 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
336 overwrite_root = 1;
337
338 item_size = btrfs_item_size_nr(eb, slot);
339 src_ptr = btrfs_item_ptr_offset(eb, slot);
340
341 /* look for the key in the destination tree */
342 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
343 if (ret == 0) {
344 char *src_copy;
345 char *dst_copy;
346 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
347 path->slots[0]);
348 if (dst_size != item_size)
349 goto insert;
350
351 if (item_size == 0) {
352 btrfs_release_path(root, path);
353 return 0;
354 }
355 dst_copy = kmalloc(item_size, GFP_NOFS);
356 src_copy = kmalloc(item_size, GFP_NOFS);
357
358 read_extent_buffer(eb, src_copy, src_ptr, item_size);
359
360 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
361 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
362 item_size);
363 ret = memcmp(dst_copy, src_copy, item_size);
364
365 kfree(dst_copy);
366 kfree(src_copy);
367 /*
368 * they have the same contents, just return, this saves
369 * us from cowing blocks in the destination tree and doing
370 * extra writes that may not have been done by a previous
371 * sync
372 */
373 if (ret == 0) {
374 btrfs_release_path(root, path);
375 return 0;
376 }
377
378 }
379insert:
380 btrfs_release_path(root, path);
381 /* try to insert the key into the destination tree */
382 ret = btrfs_insert_empty_item(trans, root, path,
383 key, item_size);
384
385 /* make sure any existing item is the correct size */
386 if (ret == -EEXIST) {
387 u32 found_size;
388 found_size = btrfs_item_size_nr(path->nodes[0],
389 path->slots[0]);
390 if (found_size > item_size) {
391 btrfs_truncate_item(trans, root, path, item_size, 1);
392 } else if (found_size < item_size) {
393 ret = btrfs_extend_item(trans, root, path,
394 item_size - found_size);
395 BUG_ON(ret);
396 }
397 } else if (ret) {
398 BUG();
399 }
400 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
401 path->slots[0]);
402
403 /* don't overwrite an existing inode if the generation number
404 * was logged as zero. This is done when the tree logging code
405 * is just logging an inode to make sure it exists after recovery.
406 *
407 * Also, don't overwrite i_size on directories during replay.
408 * log replay inserts and removes directory items based on the
409 * state of the tree found in the subvolume, and i_size is modified
410 * as it goes
411 */
412 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
413 struct btrfs_inode_item *src_item;
414 struct btrfs_inode_item *dst_item;
415
416 src_item = (struct btrfs_inode_item *)src_ptr;
417 dst_item = (struct btrfs_inode_item *)dst_ptr;
418
419 if (btrfs_inode_generation(eb, src_item) == 0)
420 goto no_copy;
421
422 if (overwrite_root &&
423 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
424 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
425 save_old_i_size = 1;
426 saved_i_size = btrfs_inode_size(path->nodes[0],
427 dst_item);
428 }
429 }
430
431 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
432 src_ptr, item_size);
433
434 if (save_old_i_size) {
435 struct btrfs_inode_item *dst_item;
436 dst_item = (struct btrfs_inode_item *)dst_ptr;
437 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
438 }
439
440 /* make sure the generation is filled in */
441 if (key->type == BTRFS_INODE_ITEM_KEY) {
442 struct btrfs_inode_item *dst_item;
443 dst_item = (struct btrfs_inode_item *)dst_ptr;
444 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
445 btrfs_set_inode_generation(path->nodes[0], dst_item,
446 trans->transid);
447 }
448 }
449no_copy:
450 btrfs_mark_buffer_dirty(path->nodes[0]);
451 btrfs_release_path(root, path);
452 return 0;
453}
454
455/*
456 * simple helper to read an inode off the disk from a given root
457 * This can only be called for subvolume roots and not for the log
458 */
459static noinline struct inode *read_one_inode(struct btrfs_root *root,
460 u64 objectid)
461{
462 struct inode *inode;
463 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
464 if (inode->i_state & I_NEW) {
465 BTRFS_I(inode)->root = root;
466 BTRFS_I(inode)->location.objectid = objectid;
467 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
468 BTRFS_I(inode)->location.offset = 0;
469 btrfs_read_locked_inode(inode);
470 unlock_new_inode(inode);
471
472 }
473 if (is_bad_inode(inode)) {
474 iput(inode);
475 inode = NULL;
476 }
477 return inode;
478}
479
480/* replays a single extent in 'eb' at 'slot' with 'key' into the
481 * subvolume 'root'. path is released on entry and should be released
482 * on exit.
483 *
484 * extents in the log tree have not been allocated out of the extent
485 * tree yet. So, this completes the allocation, taking a reference
486 * as required if the extent already exists or creating a new extent
487 * if it isn't in the extent allocation tree yet.
488 *
489 * The extent is inserted into the file, dropping any existing extents
490 * from the file that overlap the new one.
491 */
492static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
493 struct btrfs_root *root,
494 struct btrfs_path *path,
495 struct extent_buffer *eb, int slot,
496 struct btrfs_key *key)
497{
498 int found_type;
499 u64 mask = root->sectorsize - 1;
500 u64 extent_end;
501 u64 alloc_hint;
502 u64 start = key->offset;
503 u64 saved_nbytes;
504 struct btrfs_file_extent_item *item;
505 struct inode *inode = NULL;
506 unsigned long size;
507 int ret = 0;
508
509 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
510 found_type = btrfs_file_extent_type(eb, item);
511
512 if (found_type == BTRFS_FILE_EXTENT_REG ||
513 found_type == BTRFS_FILE_EXTENT_PREALLOC)
514 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
515 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
516 size = btrfs_file_extent_inline_len(eb, item);
517 extent_end = (start + size + mask) & ~mask;
518 } else {
519 ret = 0;
520 goto out;
521 }
522
523 inode = read_one_inode(root, key->objectid);
524 if (!inode) {
525 ret = -EIO;
526 goto out;
527 }
528
529 /*
530 * first check to see if we already have this extent in the
531 * file. This must be done before the btrfs_drop_extents run
532 * so we don't try to drop this extent.
533 */
534 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
535 start, 0);
536
537 if (ret == 0 &&
538 (found_type == BTRFS_FILE_EXTENT_REG ||
539 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
540 struct btrfs_file_extent_item cmp1;
541 struct btrfs_file_extent_item cmp2;
542 struct btrfs_file_extent_item *existing;
543 struct extent_buffer *leaf;
544
545 leaf = path->nodes[0];
546 existing = btrfs_item_ptr(leaf, path->slots[0],
547 struct btrfs_file_extent_item);
548
549 read_extent_buffer(eb, &cmp1, (unsigned long)item,
550 sizeof(cmp1));
551 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
552 sizeof(cmp2));
553
554 /*
555 * we already have a pointer to this exact extent,
556 * we don't have to do anything
557 */
558 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
559 btrfs_release_path(root, path);
560 goto out;
561 }
562 }
563 btrfs_release_path(root, path);
564
565 saved_nbytes = inode_get_bytes(inode);
566 /* drop any overlapping extents */
567 ret = btrfs_drop_extents(trans, root, inode,
568 start, extent_end, start, &alloc_hint);
569 BUG_ON(ret);
570
571 if (found_type == BTRFS_FILE_EXTENT_REG ||
572 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
573 unsigned long dest_offset;
574 struct btrfs_key ins;
575
576 ret = btrfs_insert_empty_item(trans, root, path, key,
577 sizeof(*item));
578 BUG_ON(ret);
579 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
580 path->slots[0]);
581 copy_extent_buffer(path->nodes[0], eb, dest_offset,
582 (unsigned long)item, sizeof(*item));
583
584 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
585 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
586 ins.type = BTRFS_EXTENT_ITEM_KEY;
587
588 if (ins.objectid > 0) {
589 u64 csum_start;
590 u64 csum_end;
591 LIST_HEAD(ordered_sums);
592 /*
593 * is this extent already allocated in the extent
594 * allocation tree? If so, just add a reference
595 */
596 ret = btrfs_lookup_extent(root, ins.objectid,
597 ins.offset);
598 if (ret == 0) {
599 ret = btrfs_inc_extent_ref(trans, root,
600 ins.objectid, ins.offset,
601 path->nodes[0]->start,
602 root->root_key.objectid,
603 trans->transid, key->objectid);
604 } else {
605 /*
606 * insert the extent pointer in the extent
607 * allocation tree
608 */
609 ret = btrfs_alloc_logged_extent(trans, root,
610 path->nodes[0]->start,
611 root->root_key.objectid,
612 trans->transid, key->objectid,
613 &ins);
614 BUG_ON(ret);
615 }
616 btrfs_release_path(root, path);
617
618 if (btrfs_file_extent_compression(eb, item)) {
619 csum_start = ins.objectid;
620 csum_end = csum_start + ins.offset;
621 } else {
622 csum_start = ins.objectid +
623 btrfs_file_extent_offset(eb, item);
624 csum_end = csum_start +
625 btrfs_file_extent_num_bytes(eb, item);
626 }
627
628 ret = btrfs_lookup_csums_range(root->log_root,
629 csum_start, csum_end - 1,
630 &ordered_sums);
631 BUG_ON(ret);
632 while (!list_empty(&ordered_sums)) {
633 struct btrfs_ordered_sum *sums;
634 sums = list_entry(ordered_sums.next,
635 struct btrfs_ordered_sum,
636 list);
637 ret = btrfs_csum_file_blocks(trans,
638 root->fs_info->csum_root,
639 sums);
640 BUG_ON(ret);
641 list_del(&sums->list);
642 kfree(sums);
643 }
644 } else {
645 btrfs_release_path(root, path);
646 }
647 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
648 /* inline extents are easy, we just overwrite them */
649 ret = overwrite_item(trans, root, path, eb, slot, key);
650 BUG_ON(ret);
651 }
652
653 inode_set_bytes(inode, saved_nbytes);
654 btrfs_update_inode(trans, root, inode);
655out:
656 if (inode)
657 iput(inode);
658 return ret;
659}
660
661/*
662 * when cleaning up conflicts between the directory names in the
663 * subvolume, directory names in the log and directory names in the
664 * inode back references, we may have to unlink inodes from directories.
665 *
666 * This is a helper function to do the unlink of a specific directory
667 * item
668 */
669static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
670 struct btrfs_root *root,
671 struct btrfs_path *path,
672 struct inode *dir,
673 struct btrfs_dir_item *di)
674{
675 struct inode *inode;
676 char *name;
677 int name_len;
678 struct extent_buffer *leaf;
679 struct btrfs_key location;
680 int ret;
681
682 leaf = path->nodes[0];
683
684 btrfs_dir_item_key_to_cpu(leaf, di, &location);
685 name_len = btrfs_dir_name_len(leaf, di);
686 name = kmalloc(name_len, GFP_NOFS);
687 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
688 btrfs_release_path(root, path);
689
690 inode = read_one_inode(root, location.objectid);
691 BUG_ON(!inode);
692
693 ret = link_to_fixup_dir(trans, root, path, location.objectid);
694 BUG_ON(ret);
695 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
696 BUG_ON(ret);
697 kfree(name);
698
699 iput(inode);
700 return ret;
701}
702
703/*
704 * helper function to see if a given name and sequence number found
705 * in an inode back reference are already in a directory and correctly
706 * point to this inode
707 */
708static noinline int inode_in_dir(struct btrfs_root *root,
709 struct btrfs_path *path,
710 u64 dirid, u64 objectid, u64 index,
711 const char *name, int name_len)
712{
713 struct btrfs_dir_item *di;
714 struct btrfs_key location;
715 int match = 0;
716
717 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
718 index, name, name_len, 0);
719 if (di && !IS_ERR(di)) {
720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
721 if (location.objectid != objectid)
722 goto out;
723 } else
724 goto out;
725 btrfs_release_path(root, path);
726
727 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
728 if (di && !IS_ERR(di)) {
729 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
730 if (location.objectid != objectid)
731 goto out;
732 } else
733 goto out;
734 match = 1;
735out:
736 btrfs_release_path(root, path);
737 return match;
738}
739
740/*
741 * helper function to check a log tree for a named back reference in
742 * an inode. This is used to decide if a back reference that is
743 * found in the subvolume conflicts with what we find in the log.
744 *
745 * inode backreferences may have multiple refs in a single item,
746 * during replay we process one reference at a time, and we don't
747 * want to delete valid links to a file from the subvolume if that
748 * link is also in the log.
749 */
750static noinline int backref_in_log(struct btrfs_root *log,
751 struct btrfs_key *key,
752 char *name, int namelen)
753{
754 struct btrfs_path *path;
755 struct btrfs_inode_ref *ref;
756 unsigned long ptr;
757 unsigned long ptr_end;
758 unsigned long name_ptr;
759 int found_name_len;
760 int item_size;
761 int ret;
762 int match = 0;
763
764 path = btrfs_alloc_path();
765 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766 if (ret != 0)
767 goto out;
768
769 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
770 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
771 ptr_end = ptr + item_size;
772 while (ptr < ptr_end) {
773 ref = (struct btrfs_inode_ref *)ptr;
774 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
775 if (found_name_len == namelen) {
776 name_ptr = (unsigned long)(ref + 1);
777 ret = memcmp_extent_buffer(path->nodes[0], name,
778 name_ptr, namelen);
779 if (ret == 0) {
780 match = 1;
781 goto out;
782 }
783 }
784 ptr = (unsigned long)(ref + 1) + found_name_len;
785 }
786out:
787 btrfs_free_path(path);
788 return match;
789}
790
791
792/*
793 * replay one inode back reference item found in the log tree.
794 * eb, slot and key refer to the buffer and key found in the log tree.
795 * root is the destination we are replaying into, and path is for temp
796 * use by this function. (it should be released on return).
797 */
798static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct btrfs_root *root,
800 struct btrfs_root *log,
801 struct btrfs_path *path,
802 struct extent_buffer *eb, int slot,
803 struct btrfs_key *key)
804{
805 struct inode *dir;
806 int ret;
807 struct btrfs_key location;
808 struct btrfs_inode_ref *ref;
809 struct btrfs_dir_item *di;
810 struct inode *inode;
811 char *name;
812 int namelen;
813 unsigned long ref_ptr;
814 unsigned long ref_end;
815
816 location.objectid = key->objectid;
817 location.type = BTRFS_INODE_ITEM_KEY;
818 location.offset = 0;
819
820 /*
821 * it is possible that we didn't log all the parent directories
822 * for a given inode. If we don't find the dir, just don't
823 * copy the back ref in. The link count fixup code will take
824 * care of the rest
825 */
826 dir = read_one_inode(root, key->offset);
827 if (!dir)
828 return -ENOENT;
829
830 inode = read_one_inode(root, key->objectid);
831 BUG_ON(!dir);
832
833 ref_ptr = btrfs_item_ptr_offset(eb, slot);
834 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
835
836again:
837 ref = (struct btrfs_inode_ref *)ref_ptr;
838
839 namelen = btrfs_inode_ref_name_len(eb, ref);
840 name = kmalloc(namelen, GFP_NOFS);
841 BUG_ON(!name);
842
843 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
844
845 /* if we already have a perfect match, we're done */
846 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
847 btrfs_inode_ref_index(eb, ref),
848 name, namelen)) {
849 goto out;
850 }
851
852 /*
853 * look for a conflicting back reference in the metadata.
854 * if we find one we have to unlink that name of the file
855 * before we add our new link. Later on, we overwrite any
856 * existing back reference, and we don't want to create
857 * dangling pointers in the directory.
858 */
859conflict_again:
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr;
866 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0];
868
869 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done
871 */
872 if (key->objectid == key->offset)
873 goto out_nowrite;
874
875 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay
877 * otherwise they must be unlinked as a conflict
878 */
879 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
880 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
881 while (ptr < ptr_end) {
882 victim_ref = (struct btrfs_inode_ref *)ptr;
883 victim_name_len = btrfs_inode_ref_name_len(leaf,
884 victim_ref);
885 victim_name = kmalloc(victim_name_len, GFP_NOFS);
886 BUG_ON(!victim_name);
887
888 read_extent_buffer(leaf, victim_name,
889 (unsigned long)(victim_ref + 1),
890 victim_name_len);
891
892 if (!backref_in_log(log, key, victim_name,
893 victim_name_len)) {
894 btrfs_inc_nlink(inode);
895 btrfs_release_path(root, path);
896 ret = btrfs_unlink_inode(trans, root, dir,
897 inode, victim_name,
898 victim_name_len);
899 kfree(victim_name);
900 btrfs_release_path(root, path);
901 goto conflict_again;
902 }
903 kfree(victim_name);
904 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
905 }
906 BUG_ON(ret);
907 }
908 btrfs_release_path(root, path);
909
910 /* look for a conflicting sequence number */
911 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
912 btrfs_inode_ref_index(eb, ref),
913 name, namelen, 0);
914 if (di && !IS_ERR(di)) {
915 ret = drop_one_dir_item(trans, root, path, dir, di);
916 BUG_ON(ret);
917 }
918 btrfs_release_path(root, path);
919
920
921 /* look for a conflicting name */
922 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
923 name, namelen, 0);
924 if (di && !IS_ERR(di)) {
925 ret = drop_one_dir_item(trans, root, path, dir, di);
926 BUG_ON(ret);
927 }
928 btrfs_release_path(root, path);
929
930 /* insert our name */
931 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
932 btrfs_inode_ref_index(eb, ref));
933 BUG_ON(ret);
934
935 btrfs_update_inode(trans, root, inode);
936
937out:
938 ref_ptr = (unsigned long)(ref + 1) + namelen;
939 kfree(name);
940 if (ref_ptr < ref_end)
941 goto again;
942
943 /* finally write the back reference in the inode */
944 ret = overwrite_item(trans, root, path, eb, slot, key);
945 BUG_ON(ret);
946
947out_nowrite:
948 btrfs_release_path(root, path);
949 iput(dir);
950 iput(inode);
951 return 0;
952}
953
954/*
955 * There are a few corners where the link count of the file can't
956 * be properly maintained during replay. So, instead of adding
957 * lots of complexity to the log code, we just scan the backrefs
958 * for any file that has been through replay.
959 *
960 * The scan will update the link count on the inode to reflect the
961 * number of back refs found. If it goes down to zero, the iput
962 * will free the inode.
963 */
964static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
965 struct btrfs_root *root,
966 struct inode *inode)
967{
968 struct btrfs_path *path;
969 int ret;
970 struct btrfs_key key;
971 u64 nlink = 0;
972 unsigned long ptr;
973 unsigned long ptr_end;
974 int name_len;
975
976 key.objectid = inode->i_ino;
977 key.type = BTRFS_INODE_REF_KEY;
978 key.offset = (u64)-1;
979
980 path = btrfs_alloc_path();
981
982 while (1) {
983 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
984 if (ret < 0)
985 break;
986 if (ret > 0) {
987 if (path->slots[0] == 0)
988 break;
989 path->slots[0]--;
990 }
991 btrfs_item_key_to_cpu(path->nodes[0], &key,
992 path->slots[0]);
993 if (key.objectid != inode->i_ino ||
994 key.type != BTRFS_INODE_REF_KEY)
995 break;
996 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
997 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
998 path->slots[0]);
999 while (ptr < ptr_end) {
1000 struct btrfs_inode_ref *ref;
1001
1002 ref = (struct btrfs_inode_ref *)ptr;
1003 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1004 ref);
1005 ptr = (unsigned long)(ref + 1) + name_len;
1006 nlink++;
1007 }
1008
1009 if (key.offset == 0)
1010 break;
1011 key.offset--;
1012 btrfs_release_path(root, path);
1013 }
1014 btrfs_free_path(path);
1015 if (nlink != inode->i_nlink) {
1016 inode->i_nlink = nlink;
1017 btrfs_update_inode(trans, root, inode);
1018 }
1019 BTRFS_I(inode)->index_cnt = (u64)-1;
1020
1021 return 0;
1022}
1023
1024static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1025 struct btrfs_root *root,
1026 struct btrfs_path *path)
1027{
1028 int ret;
1029 struct btrfs_key key;
1030 struct inode *inode;
1031
1032 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1033 key.type = BTRFS_ORPHAN_ITEM_KEY;
1034 key.offset = (u64)-1;
1035 while (1) {
1036 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1037 if (ret < 0)
1038 break;
1039
1040 if (ret == 1) {
1041 if (path->slots[0] == 0)
1042 break;
1043 path->slots[0]--;
1044 }
1045
1046 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1047 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1048 key.type != BTRFS_ORPHAN_ITEM_KEY)
1049 break;
1050
1051 ret = btrfs_del_item(trans, root, path);
1052 BUG_ON(ret);
1053
1054 btrfs_release_path(root, path);
1055 inode = read_one_inode(root, key.offset);
1056 BUG_ON(!inode);
1057
1058 ret = fixup_inode_link_count(trans, root, inode);
1059 BUG_ON(ret);
1060
1061 iput(inode);
1062
1063 if (key.offset == 0)
1064 break;
1065 key.offset--;
1066 }
1067 btrfs_release_path(root, path);
1068 return 0;
1069}
1070
1071
1072/*
1073 * record a given inode in the fixup dir so we can check its link
1074 * count when replay is done. The link count is incremented here
1075 * so the inode won't go away until we check it
1076 */
1077static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path,
1080 u64 objectid)
1081{
1082 struct btrfs_key key;
1083 int ret = 0;
1084 struct inode *inode;
1085
1086 inode = read_one_inode(root, objectid);
1087 BUG_ON(!inode);
1088
1089 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1090 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1091 key.offset = objectid;
1092
1093 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1094
1095 btrfs_release_path(root, path);
1096 if (ret == 0) {
1097 btrfs_inc_nlink(inode);
1098 btrfs_update_inode(trans, root, inode);
1099 } else if (ret == -EEXIST) {
1100 ret = 0;
1101 } else {
1102 BUG();
1103 }
1104 iput(inode);
1105
1106 return ret;
1107}
1108
1109/*
1110 * when replaying the log for a directory, we only insert names
1111 * for inodes that actually exist. This means an fsync on a directory
1112 * does not implicitly fsync all the new files in it
1113 */
1114static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1115 struct btrfs_root *root,
1116 struct btrfs_path *path,
1117 u64 dirid, u64 index,
1118 char *name, int name_len, u8 type,
1119 struct btrfs_key *location)
1120{
1121 struct inode *inode;
1122 struct inode *dir;
1123 int ret;
1124
1125 inode = read_one_inode(root, location->objectid);
1126 if (!inode)
1127 return -ENOENT;
1128
1129 dir = read_one_inode(root, dirid);
1130 if (!dir) {
1131 iput(inode);
1132 return -EIO;
1133 }
1134 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1135
1136 /* FIXME, put inode into FIXUP list */
1137
1138 iput(inode);
1139 iput(dir);
1140 return ret;
1141}
1142
1143/*
1144 * take a single entry in a log directory item and replay it into
1145 * the subvolume.
1146 *
1147 * if a conflicting item exists in the subdirectory already,
1148 * the inode it points to is unlinked and put into the link count
1149 * fix up tree.
1150 *
1151 * If a name from the log points to a file or directory that does
1152 * not exist in the FS, it is skipped. fsyncs on directories
1153 * do not force down inodes inside that directory, just changes to the
1154 * names or unlinks in a directory.
1155 */
1156static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1157 struct btrfs_root *root,
1158 struct btrfs_path *path,
1159 struct extent_buffer *eb,
1160 struct btrfs_dir_item *di,
1161 struct btrfs_key *key)
1162{
1163 char *name;
1164 int name_len;
1165 struct btrfs_dir_item *dst_di;
1166 struct btrfs_key found_key;
1167 struct btrfs_key log_key;
1168 struct inode *dir;
1169 u8 log_type;
1170 int exists;
1171 int ret;
1172
1173 dir = read_one_inode(root, key->objectid);
1174 BUG_ON(!dir);
1175
1176 name_len = btrfs_dir_name_len(eb, di);
1177 name = kmalloc(name_len, GFP_NOFS);
1178 log_type = btrfs_dir_type(eb, di);
1179 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1180 name_len);
1181
1182 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1183 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1184 if (exists == 0)
1185 exists = 1;
1186 else
1187 exists = 0;
1188 btrfs_release_path(root, path);
1189
1190 if (key->type == BTRFS_DIR_ITEM_KEY) {
1191 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1192 name, name_len, 1);
1193 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1194 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1195 key->objectid,
1196 key->offset, name,
1197 name_len, 1);
1198 } else {
1199 BUG();
1200 }
1201 if (!dst_di || IS_ERR(dst_di)) {
1202 /* we need a sequence number to insert, so we only
1203 * do inserts for the BTRFS_DIR_INDEX_KEY types
1204 */
1205 if (key->type != BTRFS_DIR_INDEX_KEY)
1206 goto out;
1207 goto insert;
1208 }
1209
1210 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1211 /* the existing item matches the logged item */
1212 if (found_key.objectid == log_key.objectid &&
1213 found_key.type == log_key.type &&
1214 found_key.offset == log_key.offset &&
1215 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1216 goto out;
1217 }
1218
1219 /*
1220 * don't drop the conflicting directory entry if the inode
1221 * for the new entry doesn't exist
1222 */
1223 if (!exists)
1224 goto out;
1225
1226 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1227 BUG_ON(ret);
1228
1229 if (key->type == BTRFS_DIR_INDEX_KEY)
1230 goto insert;
1231out:
1232 btrfs_release_path(root, path);
1233 kfree(name);
1234 iput(dir);
1235 return 0;
1236
1237insert:
1238 btrfs_release_path(root, path);
1239 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1240 name, name_len, log_type, &log_key);
1241
1242 if (ret && ret != -ENOENT)
1243 BUG();
1244 goto out;
1245}
1246
1247/*
1248 * find all the names in a directory item and reconcile them into
1249 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1250 * one name in a directory item, but the same code gets used for
1251 * both directory index types
1252 */
1253static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root,
1255 struct btrfs_path *path,
1256 struct extent_buffer *eb, int slot,
1257 struct btrfs_key *key)
1258{
1259 int ret;
1260 u32 item_size = btrfs_item_size_nr(eb, slot);
1261 struct btrfs_dir_item *di;
1262 int name_len;
1263 unsigned long ptr;
1264 unsigned long ptr_end;
1265
1266 ptr = btrfs_item_ptr_offset(eb, slot);
1267 ptr_end = ptr + item_size;
1268 while (ptr < ptr_end) {
1269 di = (struct btrfs_dir_item *)ptr;
1270 name_len = btrfs_dir_name_len(eb, di);
1271 ret = replay_one_name(trans, root, path, eb, di, key);
1272 BUG_ON(ret);
1273 ptr = (unsigned long)(di + 1);
1274 ptr += name_len;
1275 }
1276 return 0;
1277}
1278
1279/*
1280 * directory replay has two parts. There are the standard directory
1281 * items in the log copied from the subvolume, and range items
1282 * created in the log while the subvolume was logged.
1283 *
1284 * The range items tell us which parts of the key space the log
1285 * is authoritative for. During replay, if a key in the subvolume
1286 * directory is in a logged range item, but not actually in the log
1287 * that means it was deleted from the directory before the fsync
1288 * and should be removed.
1289 */
1290static noinline int find_dir_range(struct btrfs_root *root,
1291 struct btrfs_path *path,
1292 u64 dirid, int key_type,
1293 u64 *start_ret, u64 *end_ret)
1294{
1295 struct btrfs_key key;
1296 u64 found_end;
1297 struct btrfs_dir_log_item *item;
1298 int ret;
1299 int nritems;
1300
1301 if (*start_ret == (u64)-1)
1302 return 1;
1303
1304 key.objectid = dirid;
1305 key.type = key_type;
1306 key.offset = *start_ret;
1307
1308 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1309 if (ret < 0)
1310 goto out;
1311 if (ret > 0) {
1312 if (path->slots[0] == 0)
1313 goto out;
1314 path->slots[0]--;
1315 }
1316 if (ret != 0)
1317 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1318
1319 if (key.type != key_type || key.objectid != dirid) {
1320 ret = 1;
1321 goto next;
1322 }
1323 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1324 struct btrfs_dir_log_item);
1325 found_end = btrfs_dir_log_end(path->nodes[0], item);
1326
1327 if (*start_ret >= key.offset && *start_ret <= found_end) {
1328 ret = 0;
1329 *start_ret = key.offset;
1330 *end_ret = found_end;
1331 goto out;
1332 }
1333 ret = 1;
1334next:
1335 /* check the next slot in the tree to see if it is a valid item */
1336 nritems = btrfs_header_nritems(path->nodes[0]);
1337 if (path->slots[0] >= nritems) {
1338 ret = btrfs_next_leaf(root, path);
1339 if (ret)
1340 goto out;
1341 } else {
1342 path->slots[0]++;
1343 }
1344
1345 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1346
1347 if (key.type != key_type || key.objectid != dirid) {
1348 ret = 1;
1349 goto out;
1350 }
1351 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1352 struct btrfs_dir_log_item);
1353 found_end = btrfs_dir_log_end(path->nodes[0], item);
1354 *start_ret = key.offset;
1355 *end_ret = found_end;
1356 ret = 0;
1357out:
1358 btrfs_release_path(root, path);
1359 return ret;
1360}
1361
1362/*
1363 * this looks for a given directory item in the log. If the directory
1364 * item is not in the log, the item is removed and the inode it points
1365 * to is unlinked
1366 */
1367static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1368 struct btrfs_root *root,
1369 struct btrfs_root *log,
1370 struct btrfs_path *path,
1371 struct btrfs_path *log_path,
1372 struct inode *dir,
1373 struct btrfs_key *dir_key)
1374{
1375 int ret;
1376 struct extent_buffer *eb;
1377 int slot;
1378 u32 item_size;
1379 struct btrfs_dir_item *di;
1380 struct btrfs_dir_item *log_di;
1381 int name_len;
1382 unsigned long ptr;
1383 unsigned long ptr_end;
1384 char *name;
1385 struct inode *inode;
1386 struct btrfs_key location;
1387
1388again:
1389 eb = path->nodes[0];
1390 slot = path->slots[0];
1391 item_size = btrfs_item_size_nr(eb, slot);
1392 ptr = btrfs_item_ptr_offset(eb, slot);
1393 ptr_end = ptr + item_size;
1394 while (ptr < ptr_end) {
1395 di = (struct btrfs_dir_item *)ptr;
1396 name_len = btrfs_dir_name_len(eb, di);
1397 name = kmalloc(name_len, GFP_NOFS);
1398 if (!name) {
1399 ret = -ENOMEM;
1400 goto out;
1401 }
1402 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1403 name_len);
1404 log_di = NULL;
1405 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1406 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1407 dir_key->objectid,
1408 name, name_len, 0);
1409 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1410 log_di = btrfs_lookup_dir_index_item(trans, log,
1411 log_path,
1412 dir_key->objectid,
1413 dir_key->offset,
1414 name, name_len, 0);
1415 }
1416 if (!log_di || IS_ERR(log_di)) {
1417 btrfs_dir_item_key_to_cpu(eb, di, &location);
1418 btrfs_release_path(root, path);
1419 btrfs_release_path(log, log_path);
1420 inode = read_one_inode(root, location.objectid);
1421 BUG_ON(!inode);
1422
1423 ret = link_to_fixup_dir(trans, root,
1424 path, location.objectid);
1425 BUG_ON(ret);
1426 btrfs_inc_nlink(inode);
1427 ret = btrfs_unlink_inode(trans, root, dir, inode,
1428 name, name_len);
1429 BUG_ON(ret);
1430 kfree(name);
1431 iput(inode);
1432
1433 /* there might still be more names under this key
1434 * check and repeat if required
1435 */
1436 ret = btrfs_search_slot(NULL, root, dir_key, path,
1437 0, 0);
1438 if (ret == 0)
1439 goto again;
1440 ret = 0;
1441 goto out;
1442 }
1443 btrfs_release_path(log, log_path);
1444 kfree(name);
1445
1446 ptr = (unsigned long)(di + 1);
1447 ptr += name_len;
1448 }
1449 ret = 0;
1450out:
1451 btrfs_release_path(root, path);
1452 btrfs_release_path(log, log_path);
1453 return ret;
1454}
1455
1456/*
1457 * deletion replay happens before we copy any new directory items
1458 * out of the log or out of backreferences from inodes. It
1459 * scans the log to find ranges of keys that log is authoritative for,
1460 * and then scans the directory to find items in those ranges that are
1461 * not present in the log.
1462 *
1463 * Anything we don't find in the log is unlinked and removed from the
1464 * directory.
1465 */
1466static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1467 struct btrfs_root *root,
1468 struct btrfs_root *log,
1469 struct btrfs_path *path,
1470 u64 dirid)
1471{
1472 u64 range_start;
1473 u64 range_end;
1474 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1475 int ret = 0;
1476 struct btrfs_key dir_key;
1477 struct btrfs_key found_key;
1478 struct btrfs_path *log_path;
1479 struct inode *dir;
1480
1481 dir_key.objectid = dirid;
1482 dir_key.type = BTRFS_DIR_ITEM_KEY;
1483 log_path = btrfs_alloc_path();
1484 if (!log_path)
1485 return -ENOMEM;
1486
1487 dir = read_one_inode(root, dirid);
1488 /* it isn't an error if the inode isn't there, that can happen
1489 * because we replay the deletes before we copy in the inode item
1490 * from the log
1491 */
1492 if (!dir) {
1493 btrfs_free_path(log_path);
1494 return 0;
1495 }
1496again:
1497 range_start = 0;
1498 range_end = 0;
1499 while (1) {
1500 ret = find_dir_range(log, path, dirid, key_type,
1501 &range_start, &range_end);
1502 if (ret != 0)
1503 break;
1504
1505 dir_key.offset = range_start;
1506 while (1) {
1507 int nritems;
1508 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1509 0, 0);
1510 if (ret < 0)
1511 goto out;
1512
1513 nritems = btrfs_header_nritems(path->nodes[0]);
1514 if (path->slots[0] >= nritems) {
1515 ret = btrfs_next_leaf(root, path);
1516 if (ret)
1517 break;
1518 }
1519 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1520 path->slots[0]);
1521 if (found_key.objectid != dirid ||
1522 found_key.type != dir_key.type)
1523 goto next_type;
1524
1525 if (found_key.offset > range_end)
1526 break;
1527
1528 ret = check_item_in_log(trans, root, log, path,
1529 log_path, dir, &found_key);
1530 BUG_ON(ret);
1531 if (found_key.offset == (u64)-1)
1532 break;
1533 dir_key.offset = found_key.offset + 1;
1534 }
1535 btrfs_release_path(root, path);
1536 if (range_end == (u64)-1)
1537 break;
1538 range_start = range_end + 1;
1539 }
1540
1541next_type:
1542 ret = 0;
1543 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1544 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1545 dir_key.type = BTRFS_DIR_INDEX_KEY;
1546 btrfs_release_path(root, path);
1547 goto again;
1548 }
1549out:
1550 btrfs_release_path(root, path);
1551 btrfs_free_path(log_path);
1552 iput(dir);
1553 return ret;
1554}
1555
1556/*
1557 * the process_func used to replay items from the log tree. This
1558 * gets called in two different stages. The first stage just looks
1559 * for inodes and makes sure they are all copied into the subvolume.
1560 *
1561 * The second stage copies all the other item types from the log into
1562 * the subvolume. The two stage approach is slower, but gets rid of
1563 * lots of complexity around inodes referencing other inodes that exist
1564 * only in the log (references come from either directory items or inode
1565 * back refs).
1566 */
1567static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1568 struct walk_control *wc, u64 gen)
1569{
1570 int nritems;
1571 struct btrfs_path *path;
1572 struct btrfs_root *root = wc->replay_dest;
1573 struct btrfs_key key;
1574 u32 item_size;
1575 int level;
1576 int i;
1577 int ret;
1578
1579 btrfs_read_buffer(eb, gen);
1580
1581 level = btrfs_header_level(eb);
1582
1583 if (level != 0)
1584 return 0;
1585
1586 path = btrfs_alloc_path();
1587 BUG_ON(!path);
1588
1589 nritems = btrfs_header_nritems(eb);
1590 for (i = 0; i < nritems; i++) {
1591 btrfs_item_key_to_cpu(eb, &key, i);
1592 item_size = btrfs_item_size_nr(eb, i);
1593
1594 /* inode keys are done during the first stage */
1595 if (key.type == BTRFS_INODE_ITEM_KEY &&
1596 wc->stage == LOG_WALK_REPLAY_INODES) {
1597 struct inode *inode;
1598 struct btrfs_inode_item *inode_item;
1599 u32 mode;
1600
1601 inode_item = btrfs_item_ptr(eb, i,
1602 struct btrfs_inode_item);
1603 mode = btrfs_inode_mode(eb, inode_item);
1604 if (S_ISDIR(mode)) {
1605 ret = replay_dir_deletes(wc->trans,
1606 root, log, path, key.objectid);
1607 BUG_ON(ret);
1608 }
1609 ret = overwrite_item(wc->trans, root, path,
1610 eb, i, &key);
1611 BUG_ON(ret);
1612
1613 /* for regular files, truncate away
1614 * extents past the new EOF
1615 */
1616 if (S_ISREG(mode)) {
1617 inode = read_one_inode(root,
1618 key.objectid);
1619 BUG_ON(!inode);
1620
1621 ret = btrfs_truncate_inode_items(wc->trans,
1622 root, inode, inode->i_size,
1623 BTRFS_EXTENT_DATA_KEY);
1624 BUG_ON(ret);
1625 iput(inode);
1626 }
1627 ret = link_to_fixup_dir(wc->trans, root,
1628 path, key.objectid);
1629 BUG_ON(ret);
1630 }
1631 if (wc->stage < LOG_WALK_REPLAY_ALL)
1632 continue;
1633
1634 /* these keys are simply copied */
1635 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1636 ret = overwrite_item(wc->trans, root, path,
1637 eb, i, &key);
1638 BUG_ON(ret);
1639 } else if (key.type == BTRFS_INODE_REF_KEY) {
1640 ret = add_inode_ref(wc->trans, root, log, path,
1641 eb, i, &key);
1642 BUG_ON(ret && ret != -ENOENT);
1643 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1644 ret = replay_one_extent(wc->trans, root, path,
1645 eb, i, &key);
1646 BUG_ON(ret);
1647 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1648 key.type == BTRFS_DIR_INDEX_KEY) {
1649 ret = replay_one_dir_item(wc->trans, root, path,
1650 eb, i, &key);
1651 BUG_ON(ret);
1652 }
1653 }
1654 btrfs_free_path(path);
1655 return 0;
1656}
1657
1658static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1659 struct btrfs_root *root,
1660 struct btrfs_path *path, int *level,
1661 struct walk_control *wc)
1662{
1663 u64 root_owner;
1664 u64 root_gen;
1665 u64 bytenr;
1666 u64 ptr_gen;
1667 struct extent_buffer *next;
1668 struct extent_buffer *cur;
1669 struct extent_buffer *parent;
1670 u32 blocksize;
1671 int ret = 0;
1672
1673 WARN_ON(*level < 0);
1674 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1675
1676 while (*level > 0) {
1677 WARN_ON(*level < 0);
1678 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1679 cur = path->nodes[*level];
1680
1681 if (btrfs_header_level(cur) != *level)
1682 WARN_ON(1);
1683
1684 if (path->slots[*level] >=
1685 btrfs_header_nritems(cur))
1686 break;
1687
1688 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1689 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1690 blocksize = btrfs_level_size(root, *level - 1);
1691
1692 parent = path->nodes[*level];
1693 root_owner = btrfs_header_owner(parent);
1694 root_gen = btrfs_header_generation(parent);
1695
1696 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1697
1698 wc->process_func(root, next, wc, ptr_gen);
1699
1700 if (*level == 1) {
1701 path->slots[*level]++;
1702 if (wc->free) {
1703 btrfs_read_buffer(next, ptr_gen);
1704
1705 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next);
1707 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next);
1709
1710 ret = btrfs_drop_leaf_ref(trans, root, next);
1711 BUG_ON(ret);
1712
1713 WARN_ON(root_owner !=
1714 BTRFS_TREE_LOG_OBJECTID);
1715 ret = btrfs_free_reserved_extent(root,
1716 bytenr, blocksize);
1717 BUG_ON(ret);
1718 }
1719 free_extent_buffer(next);
1720 continue;
1721 }
1722 btrfs_read_buffer(next, ptr_gen);
1723
1724 WARN_ON(*level <= 0);
1725 if (path->nodes[*level-1])
1726 free_extent_buffer(path->nodes[*level-1]);
1727 path->nodes[*level-1] = next;
1728 *level = btrfs_header_level(next);
1729 path->slots[*level] = 0;
1730 cond_resched();
1731 }
1732 WARN_ON(*level < 0);
1733 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1734
1735 if (path->nodes[*level] == root->node)
1736 parent = path->nodes[*level];
1737 else
1738 parent = path->nodes[*level + 1];
1739
1740 bytenr = path->nodes[*level]->start;
1741
1742 blocksize = btrfs_level_size(root, *level);
1743 root_owner = btrfs_header_owner(parent);
1744 root_gen = btrfs_header_generation(parent);
1745
1746 wc->process_func(root, path->nodes[*level], wc,
1747 btrfs_header_generation(path->nodes[*level]));
1748
1749 if (wc->free) {
1750 next = path->nodes[*level];
1751 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next);
1753 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next);
1755
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret);
1763 }
1764 free_extent_buffer(path->nodes[*level]);
1765 path->nodes[*level] = NULL;
1766 *level += 1;
1767
1768 cond_resched();
1769 return 0;
1770}
1771
1772static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root,
1774 struct btrfs_path *path, int *level,
1775 struct walk_control *wc)
1776{
1777 u64 root_owner;
1778 u64 root_gen;
1779 int i;
1780 int slot;
1781 int ret;
1782
1783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1784 slot = path->slots[i];
1785 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1786 struct extent_buffer *node;
1787 node = path->nodes[i];
1788 path->slots[i]++;
1789 *level = i;
1790 WARN_ON(*level == 0);
1791 return 0;
1792 } else {
1793 struct extent_buffer *parent;
1794 if (path->nodes[*level] == root->node)
1795 parent = path->nodes[*level];
1796 else
1797 parent = path->nodes[*level + 1];
1798
1799 root_owner = btrfs_header_owner(parent);
1800 root_gen = btrfs_header_generation(parent);
1801 wc->process_func(root, path->nodes[*level], wc,
1802 btrfs_header_generation(path->nodes[*level]));
1803 if (wc->free) {
1804 struct extent_buffer *next;
1805
1806 next = path->nodes[*level];
1807
1808 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next);
1810 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next);
1812
1813 if (*level == 0) {
1814 ret = btrfs_drop_leaf_ref(trans, root,
1815 next);
1816 BUG_ON(ret);
1817 }
1818
1819 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1820 ret = btrfs_free_reserved_extent(root,
1821 path->nodes[*level]->start,
1822 path->nodes[*level]->len);
1823 BUG_ON(ret);
1824 }
1825 free_extent_buffer(path->nodes[*level]);
1826 path->nodes[*level] = NULL;
1827 *level = i + 1;
1828 }
1829 }
1830 return 1;
1831}
1832
1833/*
1834 * drop the reference count on the tree rooted at 'snap'. This traverses
1835 * the tree freeing any blocks that have a ref count of zero after being
1836 * decremented.
1837 */
1838static int walk_log_tree(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *log, struct walk_control *wc)
1840{
1841 int ret = 0;
1842 int wret;
1843 int level;
1844 struct btrfs_path *path;
1845 int i;
1846 int orig_level;
1847
1848 path = btrfs_alloc_path();
1849 BUG_ON(!path);
1850
1851 level = btrfs_header_level(log->node);
1852 orig_level = level;
1853 path->nodes[level] = log->node;
1854 extent_buffer_get(log->node);
1855 path->slots[level] = 0;
1856
1857 while (1) {
1858 wret = walk_down_log_tree(trans, log, path, &level, wc);
1859 if (wret > 0)
1860 break;
1861 if (wret < 0)
1862 ret = wret;
1863
1864 wret = walk_up_log_tree(trans, log, path, &level, wc);
1865 if (wret > 0)
1866 break;
1867 if (wret < 0)
1868 ret = wret;
1869 }
1870
1871 /* was the root node processed? if not, catch it here */
1872 if (path->nodes[orig_level]) {
1873 wc->process_func(log, path->nodes[orig_level], wc,
1874 btrfs_header_generation(path->nodes[orig_level]));
1875 if (wc->free) {
1876 struct extent_buffer *next;
1877
1878 next = path->nodes[orig_level];
1879
1880 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next);
1882 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next);
1884
1885 if (orig_level == 0) {
1886 ret = btrfs_drop_leaf_ref(trans, log,
1887 next);
1888 BUG_ON(ret);
1889 }
1890 WARN_ON(log->root_key.objectid !=
1891 BTRFS_TREE_LOG_OBJECTID);
1892 ret = btrfs_free_reserved_extent(log, next->start,
1893 next->len);
1894 BUG_ON(ret);
1895 }
1896 }
1897
1898 for (i = 0; i <= orig_level; i++) {
1899 if (path->nodes[i]) {
1900 free_extent_buffer(path->nodes[i]);
1901 path->nodes[i] = NULL;
1902 }
1903 }
1904 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret;
1908}
1909
1910static int wait_log_commit(struct btrfs_root *log)
1911{
1912 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid;
1914
1915 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1917 TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit))
1920 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid &&
1924 atomic_read(&log->fs_info->tree_log_commit));
1925 return 0;
1926}
1927
1928/*
1929 * btrfs_sync_log does sends a given tree log down to the disk and
1930 * updates the super blocks to record it. When this call is done,
1931 * you know that any inodes previously logged are safely on disk
1932 */
1933int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root)
1935{
1936 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root;
1939
1940 mutex_lock(&log->fs_info->tree_log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) {
1942 wait_log_commit(log);
1943 goto out;
1944 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1);
1946
1947 while (1) {
1948 batch = log->fs_info->tree_log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex);
1950 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex);
1952
1953 while (atomic_read(&log->fs_info->tree_log_writers)) {
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break;
1965 }
1966
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1970 &root->fs_info->log_root_tree->dirty_log_pages);
1971 BUG_ON(ret);
1972
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node));
1977
1978 write_ctree_super(trans, log->fs_info->tree_root, 2);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait))
1984 wake_up(&log->fs_info->tree_log_wait);
1985out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex);
1987 return 0;
1988}
1989
1990/* * free all the extents used by the tree log. This should be called
1991 * at commit time of the full transaction
1992 */
1993int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
1994{
1995 int ret;
1996 struct btrfs_root *log;
1997 struct key;
1998 u64 start;
1999 u64 end;
2000 struct walk_control wc = {
2001 .free = 1,
2002 .process_func = process_one_buffer
2003 };
2004
2005 if (!root->log_root || root->fs_info->log_root_recovering)
2006 return 0;
2007
2008 log = root->log_root;
2009 ret = walk_log_tree(trans, log, &wc);
2010 BUG_ON(ret);
2011
2012 while (1) {
2013 ret = find_first_extent_bit(&log->dirty_log_pages,
2014 0, &start, &end, EXTENT_DIRTY);
2015 if (ret)
2016 break;
2017
2018 clear_extent_dirty(&log->dirty_log_pages,
2019 start, end, GFP_NOFS);
2020 }
2021
2022 log = root->log_root;
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key);
2025 BUG_ON(ret);
2026 root->log_root = NULL;
2027 kfree(root->log_root);
2028 return 0;
2029}
2030
2031/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners:
2056 *
2057 * create file X in dir Y
2058 * link file X to X.link in dir Y
2059 * fsync file X
2060 * unlink file X but leave X.link
2061 * fsync dir Y
2062 *
2063 * After a crash we would expect only X.link to exist. But file X
2064 * didn't get fsync'd again so the log has back refs for X and X.link.
2065 *
2066 * We solve this by removing directory entries and inode backrefs from the
2067 * log when a file that was logged in the current transaction is
2068 * unlinked. Any later fsync will include the updated log entries, and
2069 * we'll be able to reconstruct the proper directory items from backrefs.
2070 *
2071 * This optimizations allows us to avoid relogging the entire inode
2072 * or the entire directory.
2073 */
2074int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2075 struct btrfs_root *root,
2076 const char *name, int name_len,
2077 struct inode *dir, u64 index)
2078{
2079 struct btrfs_root *log;
2080 struct btrfs_dir_item *di;
2081 struct btrfs_path *path;
2082 int ret;
2083 int bytes_del = 0;
2084
2085 if (BTRFS_I(dir)->logged_trans < trans->transid)
2086 return 0;
2087
2088 ret = join_running_log_trans(root);
2089 if (ret)
2090 return 0;
2091
2092 mutex_lock(&BTRFS_I(dir)->log_mutex);
2093
2094 log = root->log_root;
2095 path = btrfs_alloc_path();
2096 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2097 name, name_len, -1);
2098 if (di && !IS_ERR(di)) {
2099 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2100 bytes_del += name_len;
2101 BUG_ON(ret);
2102 }
2103 btrfs_release_path(log, path);
2104 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2105 index, name, name_len, -1);
2106 if (di && !IS_ERR(di)) {
2107 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2108 bytes_del += name_len;
2109 BUG_ON(ret);
2110 }
2111
2112 /* update the directory size in the log to reflect the names
2113 * we have removed
2114 */
2115 if (bytes_del) {
2116 struct btrfs_key key;
2117
2118 key.objectid = dir->i_ino;
2119 key.offset = 0;
2120 key.type = BTRFS_INODE_ITEM_KEY;
2121 btrfs_release_path(log, path);
2122
2123 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2124 if (ret == 0) {
2125 struct btrfs_inode_item *item;
2126 u64 i_size;
2127
2128 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2129 struct btrfs_inode_item);
2130 i_size = btrfs_inode_size(path->nodes[0], item);
2131 if (i_size > bytes_del)
2132 i_size -= bytes_del;
2133 else
2134 i_size = 0;
2135 btrfs_set_inode_size(path->nodes[0], item, i_size);
2136 btrfs_mark_buffer_dirty(path->nodes[0]);
2137 } else
2138 ret = 0;
2139 btrfs_release_path(log, path);
2140 }
2141
2142 btrfs_free_path(path);
2143 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2144 end_log_trans(root);
2145
2146 return 0;
2147}
2148
2149/* see comments for btrfs_del_dir_entries_in_log */
2150int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root,
2152 const char *name, int name_len,
2153 struct inode *inode, u64 dirid)
2154{
2155 struct btrfs_root *log;
2156 u64 index;
2157 int ret;
2158
2159 if (BTRFS_I(inode)->logged_trans < trans->transid)
2160 return 0;
2161
2162 ret = join_running_log_trans(root);
2163 if (ret)
2164 return 0;
2165 log = root->log_root;
2166 mutex_lock(&BTRFS_I(inode)->log_mutex);
2167
2168 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2169 dirid, &index);
2170 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2171 end_log_trans(root);
2172
2173 return ret;
2174}
2175
2176/*
2177 * creates a range item in the log for 'dirid'. first_offset and
2178 * last_offset tell us which parts of the key space the log should
2179 * be considered authoritative for.
2180 */
2181static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2182 struct btrfs_root *log,
2183 struct btrfs_path *path,
2184 int key_type, u64 dirid,
2185 u64 first_offset, u64 last_offset)
2186{
2187 int ret;
2188 struct btrfs_key key;
2189 struct btrfs_dir_log_item *item;
2190
2191 key.objectid = dirid;
2192 key.offset = first_offset;
2193 if (key_type == BTRFS_DIR_ITEM_KEY)
2194 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2195 else
2196 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2197 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2198 BUG_ON(ret);
2199
2200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201 struct btrfs_dir_log_item);
2202 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2203 btrfs_mark_buffer_dirty(path->nodes[0]);
2204 btrfs_release_path(log, path);
2205 return 0;
2206}
2207
2208/*
2209 * log all the items included in the current transaction for a given
2210 * directory. This also creates the range items in the log tree required
2211 * to replay anything deleted before the fsync
2212 */
2213static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2214 struct btrfs_root *root, struct inode *inode,
2215 struct btrfs_path *path,
2216 struct btrfs_path *dst_path, int key_type,
2217 u64 min_offset, u64 *last_offset_ret)
2218{
2219 struct btrfs_key min_key;
2220 struct btrfs_key max_key;
2221 struct btrfs_root *log = root->log_root;
2222 struct extent_buffer *src;
2223 int ret;
2224 int i;
2225 int nritems;
2226 u64 first_offset = min_offset;
2227 u64 last_offset = (u64)-1;
2228
2229 log = root->log_root;
2230 max_key.objectid = inode->i_ino;
2231 max_key.offset = (u64)-1;
2232 max_key.type = key_type;
2233
2234 min_key.objectid = inode->i_ino;
2235 min_key.type = key_type;
2236 min_key.offset = min_offset;
2237
2238 path->keep_locks = 1;
2239
2240 ret = btrfs_search_forward(root, &min_key, &max_key,
2241 path, 0, trans->transid);
2242
2243 /*
2244 * we didn't find anything from this transaction, see if there
2245 * is anything at all
2246 */
2247 if (ret != 0 || min_key.objectid != inode->i_ino ||
2248 min_key.type != key_type) {
2249 min_key.objectid = inode->i_ino;
2250 min_key.type = key_type;
2251 min_key.offset = (u64)-1;
2252 btrfs_release_path(root, path);
2253 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2254 if (ret < 0) {
2255 btrfs_release_path(root, path);
2256 return ret;
2257 }
2258 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2259
2260 /* if ret == 0 there are items for this type,
2261 * create a range to tell us the last key of this type.
2262 * otherwise, there are no items in this directory after
2263 * *min_offset, and we create a range to indicate that.
2264 */
2265 if (ret == 0) {
2266 struct btrfs_key tmp;
2267 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2268 path->slots[0]);
2269 if (key_type == tmp.type)
2270 first_offset = max(min_offset, tmp.offset) + 1;
2271 }
2272 goto done;
2273 }
2274
2275 /* go backward to find any previous key */
2276 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2277 if (ret == 0) {
2278 struct btrfs_key tmp;
2279 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2280 if (key_type == tmp.type) {
2281 first_offset = tmp.offset;
2282 ret = overwrite_item(trans, log, dst_path,
2283 path->nodes[0], path->slots[0],
2284 &tmp);
2285 }
2286 }
2287 btrfs_release_path(root, path);
2288
2289 /* find the first key from this transaction again */
2290 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2291 if (ret != 0) {
2292 WARN_ON(1);
2293 goto done;
2294 }
2295
2296 /*
2297 * we have a block from this transaction, log every item in it
2298 * from our directory
2299 */
2300 while (1) {
2301 struct btrfs_key tmp;
2302 src = path->nodes[0];
2303 nritems = btrfs_header_nritems(src);
2304 for (i = path->slots[0]; i < nritems; i++) {
2305 btrfs_item_key_to_cpu(src, &min_key, i);
2306
2307 if (min_key.objectid != inode->i_ino ||
2308 min_key.type != key_type)
2309 goto done;
2310 ret = overwrite_item(trans, log, dst_path, src, i,
2311 &min_key);
2312 BUG_ON(ret);
2313 }
2314 path->slots[0] = nritems;
2315
2316 /*
2317 * look ahead to the next item and see if it is also
2318 * from this directory and from this transaction
2319 */
2320 ret = btrfs_next_leaf(root, path);
2321 if (ret == 1) {
2322 last_offset = (u64)-1;
2323 goto done;
2324 }
2325 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2326 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2327 last_offset = (u64)-1;
2328 goto done;
2329 }
2330 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2331 ret = overwrite_item(trans, log, dst_path,
2332 path->nodes[0], path->slots[0],
2333 &tmp);
2334
2335 BUG_ON(ret);
2336 last_offset = tmp.offset;
2337 goto done;
2338 }
2339 }
2340done:
2341 *last_offset_ret = last_offset;
2342 btrfs_release_path(root, path);
2343 btrfs_release_path(log, dst_path);
2344
2345 /* insert the log range keys to indicate where the log is valid */
2346 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2347 first_offset, last_offset);
2348 BUG_ON(ret);
2349 return 0;
2350}
2351
2352/*
2353 * logging directories is very similar to logging inodes, We find all the items
2354 * from the current transaction and write them to the log.
2355 *
2356 * The recovery code scans the directory in the subvolume, and if it finds a
2357 * key in the range logged that is not present in the log tree, then it means
2358 * that dir entry was unlinked during the transaction.
2359 *
2360 * In order for that scan to work, we must include one key smaller than
2361 * the smallest logged by this transaction and one key larger than the largest
2362 * key logged by this transaction.
2363 */
2364static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2365 struct btrfs_root *root, struct inode *inode,
2366 struct btrfs_path *path,
2367 struct btrfs_path *dst_path)
2368{
2369 u64 min_key;
2370 u64 max_key;
2371 int ret;
2372 int key_type = BTRFS_DIR_ITEM_KEY;
2373
2374again:
2375 min_key = 0;
2376 max_key = 0;
2377 while (1) {
2378 ret = log_dir_items(trans, root, inode, path,
2379 dst_path, key_type, min_key,
2380 &max_key);
2381 BUG_ON(ret);
2382 if (max_key == (u64)-1)
2383 break;
2384 min_key = max_key + 1;
2385 }
2386
2387 if (key_type == BTRFS_DIR_ITEM_KEY) {
2388 key_type = BTRFS_DIR_INDEX_KEY;
2389 goto again;
2390 }
2391 return 0;
2392}
2393
2394/*
2395 * a helper function to drop items from the log before we relog an
2396 * inode. max_key_type indicates the highest item type to remove.
2397 * This cannot be run for file data extents because it does not
2398 * free the extents they point to.
2399 */
2400static int drop_objectid_items(struct btrfs_trans_handle *trans,
2401 struct btrfs_root *log,
2402 struct btrfs_path *path,
2403 u64 objectid, int max_key_type)
2404{
2405 int ret;
2406 struct btrfs_key key;
2407 struct btrfs_key found_key;
2408
2409 key.objectid = objectid;
2410 key.type = max_key_type;
2411 key.offset = (u64)-1;
2412
2413 while (1) {
2414 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2415
2416 if (ret != 1)
2417 break;
2418
2419 if (path->slots[0] == 0)
2420 break;
2421
2422 path->slots[0]--;
2423 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2424 path->slots[0]);
2425
2426 if (found_key.objectid != objectid)
2427 break;
2428
2429 ret = btrfs_del_item(trans, log, path);
2430 BUG_ON(ret);
2431 btrfs_release_path(log, path);
2432 }
2433 btrfs_release_path(log, path);
2434 return 0;
2435}
2436
2437static noinline int copy_items(struct btrfs_trans_handle *trans,
2438 struct btrfs_root *log,
2439 struct btrfs_path *dst_path,
2440 struct extent_buffer *src,
2441 int start_slot, int nr, int inode_only)
2442{
2443 unsigned long src_offset;
2444 unsigned long dst_offset;
2445 struct btrfs_file_extent_item *extent;
2446 struct btrfs_inode_item *inode_item;
2447 int ret;
2448 struct btrfs_key *ins_keys;
2449 u32 *ins_sizes;
2450 char *ins_data;
2451 int i;
2452 struct list_head ordered_sums;
2453
2454 INIT_LIST_HEAD(&ordered_sums);
2455
2456 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2457 nr * sizeof(u32), GFP_NOFS);
2458 ins_sizes = (u32 *)ins_data;
2459 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2460
2461 for (i = 0; i < nr; i++) {
2462 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2463 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2464 }
2465 ret = btrfs_insert_empty_items(trans, log, dst_path,
2466 ins_keys, ins_sizes, nr);
2467 BUG_ON(ret);
2468
2469 for (i = 0; i < nr; i++) {
2470 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2471 dst_path->slots[0]);
2472
2473 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2474
2475 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2476 src_offset, ins_sizes[i]);
2477
2478 if (inode_only == LOG_INODE_EXISTS &&
2479 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2480 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2481 dst_path->slots[0],
2482 struct btrfs_inode_item);
2483 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2484
2485 /* set the generation to zero so the recover code
2486 * can tell the difference between an logging
2487 * just to say 'this inode exists' and a logging
2488 * to say 'update this inode with these values'
2489 */
2490 btrfs_set_inode_generation(dst_path->nodes[0],
2491 inode_item, 0);
2492 }
2493 /* take a reference on file data extents so that truncates
2494 * or deletes of this inode don't have to relog the inode
2495 * again
2496 */
2497 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2498 int found_type;
2499 extent = btrfs_item_ptr(src, start_slot + i,
2500 struct btrfs_file_extent_item);
2501
2502 found_type = btrfs_file_extent_type(src, extent);
2503 if (found_type == BTRFS_FILE_EXTENT_REG ||
2504 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2505 u64 ds = btrfs_file_extent_disk_bytenr(src,
2506 extent);
2507 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2508 extent);
2509 u64 cs = btrfs_file_extent_offset(src, extent);
2510 u64 cl = btrfs_file_extent_num_bytes(src,
2511 extent);;
2512 if (btrfs_file_extent_compression(src,
2513 extent)) {
2514 cs = 0;
2515 cl = dl;
2516 }
2517 /* ds == 0 is a hole */
2518 if (ds != 0) {
2519 ret = btrfs_inc_extent_ref(trans, log,
2520 ds, dl,
2521 dst_path->nodes[0]->start,
2522 BTRFS_TREE_LOG_OBJECTID,
2523 trans->transid,
2524 ins_keys[i].objectid);
2525 BUG_ON(ret);
2526 ret = btrfs_lookup_csums_range(
2527 log->fs_info->csum_root,
2528 ds + cs, ds + cs + cl - 1,
2529 &ordered_sums);
2530 BUG_ON(ret);
2531 }
2532 }
2533 }
2534 dst_path->slots[0]++;
2535 }
2536
2537 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2538 btrfs_release_path(log, dst_path);
2539 kfree(ins_data);
2540
2541 /*
2542 * we have to do this after the loop above to avoid changing the
2543 * log tree while trying to change the log tree.
2544 */
2545 while (!list_empty(&ordered_sums)) {
2546 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2547 struct btrfs_ordered_sum,
2548 list);
2549 ret = btrfs_csum_file_blocks(trans, log, sums);
2550 BUG_ON(ret);
2551 list_del(&sums->list);
2552 kfree(sums);
2553 }
2554 return 0;
2555}
2556
2557/* log a single inode in the tree log.
2558 * At least one parent directory for this inode must exist in the tree
2559 * or be logged already.
2560 *
2561 * Any items from this inode changed by the current transaction are copied
2562 * to the log tree. An extra reference is taken on any extents in this
2563 * file, allowing us to avoid a whole pile of corner cases around logging
2564 * blocks that have been removed from the tree.
2565 *
2566 * See LOG_INODE_ALL and related defines for a description of what inode_only
2567 * does.
2568 *
2569 * This handles both files and directories.
2570 */
2571static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2572 struct btrfs_root *root, struct inode *inode,
2573 int inode_only)
2574{
2575 struct btrfs_path *path;
2576 struct btrfs_path *dst_path;
2577 struct btrfs_key min_key;
2578 struct btrfs_key max_key;
2579 struct btrfs_root *log = root->log_root;
2580 struct extent_buffer *src = NULL;
2581 u32 size;
2582 int ret;
2583 int nritems;
2584 int ins_start_slot = 0;
2585 int ins_nr;
2586
2587 log = root->log_root;
2588
2589 path = btrfs_alloc_path();
2590 dst_path = btrfs_alloc_path();
2591
2592 min_key.objectid = inode->i_ino;
2593 min_key.type = BTRFS_INODE_ITEM_KEY;
2594 min_key.offset = 0;
2595
2596 max_key.objectid = inode->i_ino;
2597 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2598 max_key.type = BTRFS_XATTR_ITEM_KEY;
2599 else
2600 max_key.type = (u8)-1;
2601 max_key.offset = (u64)-1;
2602
2603 /*
2604 * if this inode has already been logged and we're in inode_only
2605 * mode, we don't want to delete the things that have already
2606 * been written to the log.
2607 *
2608 * But, if the inode has been through an inode_only log,
2609 * the logged_trans field is not set. This allows us to catch
2610 * any new names for this inode in the backrefs by logging it
2611 * again
2612 */
2613 if (inode_only == LOG_INODE_EXISTS &&
2614 BTRFS_I(inode)->logged_trans == trans->transid) {
2615 btrfs_free_path(path);
2616 btrfs_free_path(dst_path);
2617 goto out;
2618 }
2619 mutex_lock(&BTRFS_I(inode)->log_mutex);
2620
2621 /*
2622 * a brute force approach to making sure we get the most uptodate
2623 * copies of everything.
2624 */
2625 if (S_ISDIR(inode->i_mode)) {
2626 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2627
2628 if (inode_only == LOG_INODE_EXISTS)
2629 max_key_type = BTRFS_XATTR_ITEM_KEY;
2630 ret = drop_objectid_items(trans, log, path,
2631 inode->i_ino, max_key_type);
2632 } else {
2633 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2634 }
2635 BUG_ON(ret);
2636 path->keep_locks = 1;
2637
2638 while (1) {
2639 ins_nr = 0;
2640 ret = btrfs_search_forward(root, &min_key, &max_key,
2641 path, 0, trans->transid);
2642 if (ret != 0)
2643 break;
2644again:
2645 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2646 if (min_key.objectid != inode->i_ino)
2647 break;
2648 if (min_key.type > max_key.type)
2649 break;
2650
2651 src = path->nodes[0];
2652 size = btrfs_item_size_nr(src, path->slots[0]);
2653 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2654 ins_nr++;
2655 goto next_slot;
2656 } else if (!ins_nr) {
2657 ins_start_slot = path->slots[0];
2658 ins_nr = 1;
2659 goto next_slot;
2660 }
2661
2662 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2663 ins_nr, inode_only);
2664 BUG_ON(ret);
2665 ins_nr = 1;
2666 ins_start_slot = path->slots[0];
2667next_slot:
2668
2669 nritems = btrfs_header_nritems(path->nodes[0]);
2670 path->slots[0]++;
2671 if (path->slots[0] < nritems) {
2672 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2673 path->slots[0]);
2674 goto again;
2675 }
2676 if (ins_nr) {
2677 ret = copy_items(trans, log, dst_path, src,
2678 ins_start_slot,
2679 ins_nr, inode_only);
2680 BUG_ON(ret);
2681 ins_nr = 0;
2682 }
2683 btrfs_release_path(root, path);
2684
2685 if (min_key.offset < (u64)-1)
2686 min_key.offset++;
2687 else if (min_key.type < (u8)-1)
2688 min_key.type++;
2689 else if (min_key.objectid < (u64)-1)
2690 min_key.objectid++;
2691 else
2692 break;
2693 }
2694 if (ins_nr) {
2695 ret = copy_items(trans, log, dst_path, src,
2696 ins_start_slot,
2697 ins_nr, inode_only);
2698 BUG_ON(ret);
2699 ins_nr = 0;
2700 }
2701 WARN_ON(ins_nr);
2702 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2703 btrfs_release_path(root, path);
2704 btrfs_release_path(log, dst_path);
2705 BTRFS_I(inode)->log_dirty_trans = 0;
2706 ret = log_directory_changes(trans, root, inode, path, dst_path);
2707 BUG_ON(ret);
2708 }
2709 BTRFS_I(inode)->logged_trans = trans->transid;
2710 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2711
2712 btrfs_free_path(path);
2713 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out:
2720 return 0;
2721}
2722
2723int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 struct btrfs_root *root, struct inode *inode,
2725 int inode_only)
2726{
2727 int ret;
2728
2729 start_log_trans(trans, root);
2730 ret = __btrfs_log_inode(trans, root, inode, inode_only);
2731 end_log_trans(root);
2732 return ret;
2733}
2734
2735/*
2736 * helper function around btrfs_log_inode to make sure newly created
2737 * parent directories also end up in the log. A minimal inode and backref
2738 * only logging is done of any parent directories that are older than
2739 * the last committed transaction
2740 */
2741int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2742 struct btrfs_root *root, struct dentry *dentry)
2743{
2744 int inode_only = LOG_INODE_ALL;
2745 struct super_block *sb;
2746 int ret;
2747
2748 start_log_trans(trans, root);
2749 sb = dentry->d_inode->i_sb;
2750 while (1) {
2751 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2752 inode_only);
2753 BUG_ON(ret);
2754 inode_only = LOG_INODE_EXISTS;
2755
2756 dentry = dentry->d_parent;
2757 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2758 break;
2759
2760 if (BTRFS_I(dentry->d_inode)->generation <=
2761 root->fs_info->last_trans_committed)
2762 break;
2763 }
2764 end_log_trans(root);
2765 return 0;
2766}
2767
2768/*
2769 * it is not safe to log dentry if the chunk root has added new
2770 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2771 * If this returns 1, you must commit the transaction to safely get your
2772 * data on disk.
2773 */
2774int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2775 struct btrfs_root *root, struct dentry *dentry)
2776{
2777 u64 gen;
2778 gen = root->fs_info->last_trans_new_blockgroup;
2779 if (gen > root->fs_info->last_trans_committed)
2780 return 1;
2781 else
2782 return btrfs_log_dentry(trans, root, dentry);
2783}
2784
2785/*
2786 * should be called during mount to recover any replay any log trees
2787 * from the FS
2788 */
2789int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2790{
2791 int ret;
2792 struct btrfs_path *path;
2793 struct btrfs_trans_handle *trans;
2794 struct btrfs_key key;
2795 struct btrfs_key found_key;
2796 struct btrfs_key tmp_key;
2797 struct btrfs_root *log;
2798 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2799 u64 highest_inode;
2800 struct walk_control wc = {
2801 .process_func = process_one_buffer,
2802 .stage = 0,
2803 };
2804
2805 fs_info->log_root_recovering = 1;
2806 path = btrfs_alloc_path();
2807 BUG_ON(!path);
2808
2809 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2810
2811 wc.trans = trans;
2812 wc.pin = 1;
2813
2814 walk_log_tree(trans, log_root_tree, &wc);
2815
2816again:
2817 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2818 key.offset = (u64)-1;
2819 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2820
2821 while (1) {
2822 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2823 if (ret < 0)
2824 break;
2825 if (ret > 0) {
2826 if (path->slots[0] == 0)
2827 break;
2828 path->slots[0]--;
2829 }
2830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2831 path->slots[0]);
2832 btrfs_release_path(log_root_tree, path);
2833 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2834 break;
2835
2836 log = btrfs_read_fs_root_no_radix(log_root_tree,
2837 &found_key);
2838 BUG_ON(!log);
2839
2840
2841 tmp_key.objectid = found_key.offset;
2842 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2843 tmp_key.offset = (u64)-1;
2844
2845 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2846 BUG_ON(!wc.replay_dest);
2847
2848 wc.replay_dest->log_root = log;
2849 btrfs_record_root_in_trans(wc.replay_dest);
2850 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret);
2852
2853 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2854 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2855 path);
2856 BUG_ON(ret);
2857 }
2858 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2859 if (ret == 0) {
2860 wc.replay_dest->highest_inode = highest_inode;
2861 wc.replay_dest->last_inode_alloc = highest_inode;
2862 }
2863
2864 key.offset = found_key.offset - 1;
2865 wc.replay_dest->log_root = NULL;
2866 free_extent_buffer(log->node);
2867 kfree(log);
2868
2869 if (found_key.offset == 0)
2870 break;
2871 }
2872 btrfs_release_path(log_root_tree, path);
2873
2874 /* step one is to pin it all, step two is to replay just inodes */
2875 if (wc.pin) {
2876 wc.pin = 0;
2877 wc.process_func = replay_one_buffer;
2878 wc.stage = LOG_WALK_REPLAY_INODES;
2879 goto again;
2880 }
2881 /* step three is to replay everything */
2882 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2883 wc.stage++;
2884 goto again;
2885 }
2886
2887 btrfs_free_path(path);
2888
2889 free_extent_buffer(log_root_tree->node);
2890 log_root_tree->log_root = NULL;
2891 fs_info->log_root_recovering = 0;
2892
2893 /* step 4: commit the transaction, which also unpins the blocks */
2894 btrfs_commit_transaction(trans, fs_info->tree_root);
2895
2896 kfree(log_root_tree);
2897 return 0;
2898}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11which git &> /dev/null
12if [ $? == 0 ]; then
13 git branch >& /dev/null
14 if [ $? == 0 ]; then
15 if head=`git rev-parse --verify HEAD 2>/dev/null`; then
16 if tag=`git describe --tags 2>/dev/null`; then
17 v="$tag"
18 fi
19
20 # Are there uncommitted changes?
21 git update-index --refresh --unmerged > /dev/null
22 if git diff-index --name-only HEAD | \
23 grep -v "^scripts/package" \
24 | read dummy; then
25 v="$v"-dirty
26 fi
27 fi
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..b187b537888e
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3218 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h>
25#include "compat.h"
26#include "ctree.h"
27#include "extent_map.h"
28#include "disk-io.h"
29#include "transaction.h"
30#include "print-tree.h"
31#include "volumes.h"
32#include "async-thread.h"
33
34struct map_lookup {
35 u64 type;
36 int io_align;
37 int io_width;
38 int stripe_len;
39 int sector_size;
40 int num_stripes;
41 int sub_stripes;
42 struct btrfs_bio_stripe stripes[];
43};
44
45static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root,
47 struct btrfs_device *device);
48static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49
50#define map_lookup_size(n) (sizeof(struct map_lookup) + \
51 (sizeof(struct btrfs_bio_stripe) * (n)))
52
53static DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids);
55
56void btrfs_lock_volumes(void)
57{
58 mutex_lock(&uuid_mutex);
59}
60
61void btrfs_unlock_volumes(void)
62{
63 mutex_unlock(&uuid_mutex);
64}
65
66static void lock_chunks(struct btrfs_root *root)
67{
68 mutex_lock(&root->fs_info->chunk_mutex);
69}
70
71static void unlock_chunks(struct btrfs_root *root)
72{
73 mutex_unlock(&root->fs_info->chunk_mutex);
74}
75
76static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
77{
78 struct btrfs_device *device;
79 WARN_ON(fs_devices->opened);
80 while (!list_empty(&fs_devices->devices)) {
81 device = list_entry(fs_devices->devices.next,
82 struct btrfs_device, dev_list);
83 list_del(&device->dev_list);
84 kfree(device->name);
85 kfree(device);
86 }
87 kfree(fs_devices);
88}
89
90int btrfs_cleanup_fs_uuids(void)
91{
92 struct btrfs_fs_devices *fs_devices;
93
94 while (!list_empty(&fs_uuids)) {
95 fs_devices = list_entry(fs_uuids.next,
96 struct btrfs_fs_devices, list);
97 list_del(&fs_devices->list);
98 free_fs_devices(fs_devices);
99 }
100 return 0;
101}
102
103static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid)
105{
106 struct btrfs_device *dev;
107 struct list_head *cur;
108
109 list_for_each(cur, head) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev;
114 }
115 }
116 return NULL;
117}
118
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices;
123
124 list_for_each(cur, &fs_uuids) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices;
128 }
129 return NULL;
130}
131
132/*
133 * we try to collect pending bios for a device so we don't get a large
134 * number of procs sending bios down to the same device. This greatly
135 * improves the schedulers ability to collect and merge the bios.
136 *
137 * But, it also turns into a long list of bios to process and that is sure
138 * to eventually make the worker thread block. The solution here is to
139 * make some progress and then put this work struct back at the end of
140 * the list if the block device is congested. This way, multiple devices
141 * can make progress from a single worker thread.
142 */
143static noinline int run_scheduled_bios(struct btrfs_device *device)
144{
145 struct bio *pending;
146 struct backing_dev_info *bdi;
147 struct btrfs_fs_info *fs_info;
148 struct bio *tail;
149 struct bio *cur;
150 int again = 0;
151 unsigned long num_run = 0;
152 unsigned long limit;
153
154 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
155 fs_info = device->dev_root->fs_info;
156 limit = btrfs_async_submit_limit(fs_info);
157 limit = limit * 2 / 3;
158
159loop:
160 spin_lock(&device->io_lock);
161
162 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted
165 * into the list if we hit congestion
166 */
167 pending = device->pending_bios;
168 tail = device->pending_bio_tail;
169 WARN_ON(pending && !tail);
170 device->pending_bios = NULL;
171 device->pending_bio_tail = NULL;
172
173 /*
174 * if pending was null this time around, no bios need processing
175 * at all and we can stop. Otherwise it'll loop back up again
176 * and do an additional check so no bios are missed.
177 *
178 * device->running_pending is used to synchronize with the
179 * schedule_bio code.
180 */
181 if (pending) {
182 again = 1;
183 device->running_pending = 1;
184 } else {
185 again = 0;
186 device->running_pending = 0;
187 }
188 spin_unlock(&device->io_lock);
189
190 while (pending) {
191 cur = pending;
192 pending = pending->bi_next;
193 cur->bi_next = NULL;
194 atomic_dec(&fs_info->nr_async_bios);
195
196 if (atomic_read(&fs_info->nr_async_bios) < limit &&
197 waitqueue_active(&fs_info->async_submit_wait))
198 wake_up(&fs_info->async_submit_wait);
199
200 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
201 bio_get(cur);
202 submit_bio(cur->bi_rw, cur);
203 bio_put(cur);
204 num_run++;
205
206 /*
207 * we made progress, there is more work to do and the bdi
208 * is now congested. Back off and let other work structs
209 * run instead
210 */
211 if (pending && bdi_write_congested(bdi) &&
212 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head;
214
215 spin_lock(&device->io_lock);
216
217 old_head = device->pending_bios;
218 device->pending_bios = pending;
219 if (device->pending_bio_tail)
220 tail->bi_next = old_head;
221 else
222 device->pending_bio_tail = tail;
223
224 spin_unlock(&device->io_lock);
225 btrfs_requeue_work(&device->work);
226 goto done;
227 }
228 }
229 if (again)
230 goto loop;
231done:
232 return 0;
233}
234
235static void pending_bios_fn(struct btrfs_work *work)
236{
237 struct btrfs_device *device;
238
239 device = container_of(work, struct btrfs_device, work);
240 run_scheduled_bios(device);
241}
242
243static noinline int device_list_add(const char *path,
244 struct btrfs_super_block *disk_super,
245 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
246{
247 struct btrfs_device *device;
248 struct btrfs_fs_devices *fs_devices;
249 u64 found_transid = btrfs_super_generation(disk_super);
250
251 fs_devices = find_fsid(disk_super->fsid);
252 if (!fs_devices) {
253 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
254 if (!fs_devices)
255 return -ENOMEM;
256 INIT_LIST_HEAD(&fs_devices->devices);
257 INIT_LIST_HEAD(&fs_devices->alloc_list);
258 list_add(&fs_devices->list, &fs_uuids);
259 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
260 fs_devices->latest_devid = devid;
261 fs_devices->latest_trans = found_transid;
262 device = NULL;
263 } else {
264 device = __find_device(&fs_devices->devices, devid,
265 disk_super->dev_item.uuid);
266 }
267 if (!device) {
268 if (fs_devices->opened)
269 return -EBUSY;
270
271 device = kzalloc(sizeof(*device), GFP_NOFS);
272 if (!device) {
273 /* we can safely leave the fs_devices entry around */
274 return -ENOMEM;
275 }
276 device->devid = devid;
277 device->work.func = pending_bios_fn;
278 memcpy(device->uuid, disk_super->dev_item.uuid,
279 BTRFS_UUID_SIZE);
280 device->barriers = 1;
281 spin_lock_init(&device->io_lock);
282 device->name = kstrdup(path, GFP_NOFS);
283 if (!device->name) {
284 kfree(device);
285 return -ENOMEM;
286 }
287 INIT_LIST_HEAD(&device->dev_alloc_list);
288 list_add(&device->dev_list, &fs_devices->devices);
289 device->fs_devices = fs_devices;
290 fs_devices->num_devices++;
291 }
292
293 if (found_transid > fs_devices->latest_trans) {
294 fs_devices->latest_devid = devid;
295 fs_devices->latest_trans = found_transid;
296 }
297 *fs_devices_ret = fs_devices;
298 return 0;
299}
300
301static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
302{
303 struct btrfs_fs_devices *fs_devices;
304 struct btrfs_device *device;
305 struct btrfs_device *orig_dev;
306
307 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
308 if (!fs_devices)
309 return ERR_PTR(-ENOMEM);
310
311 INIT_LIST_HEAD(&fs_devices->devices);
312 INIT_LIST_HEAD(&fs_devices->alloc_list);
313 INIT_LIST_HEAD(&fs_devices->list);
314 fs_devices->latest_devid = orig->latest_devid;
315 fs_devices->latest_trans = orig->latest_trans;
316 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
317
318 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
319 device = kzalloc(sizeof(*device), GFP_NOFS);
320 if (!device)
321 goto error;
322
323 device->name = kstrdup(orig_dev->name, GFP_NOFS);
324 if (!device->name)
325 goto error;
326
327 device->devid = orig_dev->devid;
328 device->work.func = pending_bios_fn;
329 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
330 device->barriers = 1;
331 spin_lock_init(&device->io_lock);
332 INIT_LIST_HEAD(&device->dev_list);
333 INIT_LIST_HEAD(&device->dev_alloc_list);
334
335 list_add(&device->dev_list, &fs_devices->devices);
336 device->fs_devices = fs_devices;
337 fs_devices->num_devices++;
338 }
339 return fs_devices;
340error:
341 free_fs_devices(fs_devices);
342 return ERR_PTR(-ENOMEM);
343}
344
345int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
346{
347 struct list_head *tmp;
348 struct list_head *cur;
349 struct btrfs_device *device;
350
351 mutex_lock(&uuid_mutex);
352again:
353 list_for_each_safe(cur, tmp, &fs_devices->devices) {
354 device = list_entry(cur, struct btrfs_device, dev_list);
355 if (device->in_fs_metadata)
356 continue;
357
358 if (device->bdev) {
359 close_bdev_exclusive(device->bdev, device->mode);
360 device->bdev = NULL;
361 fs_devices->open_devices--;
362 }
363 if (device->writeable) {
364 list_del_init(&device->dev_alloc_list);
365 device->writeable = 0;
366 fs_devices->rw_devices--;
367 }
368 list_del_init(&device->dev_list);
369 fs_devices->num_devices--;
370 kfree(device->name);
371 kfree(device);
372 }
373
374 if (fs_devices->seed) {
375 fs_devices = fs_devices->seed;
376 goto again;
377 }
378
379 mutex_unlock(&uuid_mutex);
380 return 0;
381}
382
383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
384{
385 struct list_head *cur;
386 struct btrfs_device *device;
387
388 if (--fs_devices->opened > 0)
389 return 0;
390
391 list_for_each(cur, &fs_devices->devices) {
392 device = list_entry(cur, struct btrfs_device, dev_list);
393 if (device->bdev) {
394 close_bdev_exclusive(device->bdev, device->mode);
395 fs_devices->open_devices--;
396 }
397 if (device->writeable) {
398 list_del_init(&device->dev_alloc_list);
399 fs_devices->rw_devices--;
400 }
401
402 device->bdev = NULL;
403 device->writeable = 0;
404 device->in_fs_metadata = 0;
405 }
406 WARN_ON(fs_devices->open_devices);
407 WARN_ON(fs_devices->rw_devices);
408 fs_devices->opened = 0;
409 fs_devices->seeding = 0;
410
411 return 0;
412}
413
414int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
415{
416 struct btrfs_fs_devices *seed_devices = NULL;
417 int ret;
418
419 mutex_lock(&uuid_mutex);
420 ret = __btrfs_close_devices(fs_devices);
421 if (!fs_devices->opened) {
422 seed_devices = fs_devices->seed;
423 fs_devices->seed = NULL;
424 }
425 mutex_unlock(&uuid_mutex);
426
427 while (seed_devices) {
428 fs_devices = seed_devices;
429 seed_devices = fs_devices->seed;
430 __btrfs_close_devices(fs_devices);
431 free_fs_devices(fs_devices);
432 }
433 return ret;
434}
435
436static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
437 fmode_t flags, void *holder)
438{
439 struct block_device *bdev;
440 struct list_head *head = &fs_devices->devices;
441 struct list_head *cur;
442 struct btrfs_device *device;
443 struct block_device *latest_bdev = NULL;
444 struct buffer_head *bh;
445 struct btrfs_super_block *disk_super;
446 u64 latest_devid = 0;
447 u64 latest_transid = 0;
448 u64 devid;
449 int seeding = 1;
450 int ret = 0;
451
452 list_for_each(cur, head) {
453 device = list_entry(cur, struct btrfs_device, dev_list);
454 if (device->bdev)
455 continue;
456 if (!device->name)
457 continue;
458
459 bdev = open_bdev_exclusive(device->name, flags, holder);
460 if (IS_ERR(bdev)) {
461 printk(KERN_INFO "open %s failed\n", device->name);
462 goto error;
463 }
464 set_blocksize(bdev, 4096);
465
466 bh = btrfs_read_dev_super(bdev);
467 if (!bh)
468 goto error_close;
469
470 disk_super = (struct btrfs_super_block *)bh->b_data;
471 devid = le64_to_cpu(disk_super->dev_item.devid);
472 if (devid != device->devid)
473 goto error_brelse;
474
475 if (memcmp(device->uuid, disk_super->dev_item.uuid,
476 BTRFS_UUID_SIZE))
477 goto error_brelse;
478
479 device->generation = btrfs_super_generation(disk_super);
480 if (!latest_transid || device->generation > latest_transid) {
481 latest_devid = devid;
482 latest_transid = device->generation;
483 latest_bdev = bdev;
484 }
485
486 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
487 device->writeable = 0;
488 } else {
489 device->writeable = !bdev_read_only(bdev);
490 seeding = 0;
491 }
492
493 device->bdev = bdev;
494 device->in_fs_metadata = 0;
495 device->mode = flags;
496
497 fs_devices->open_devices++;
498 if (device->writeable) {
499 fs_devices->rw_devices++;
500 list_add(&device->dev_alloc_list,
501 &fs_devices->alloc_list);
502 }
503 continue;
504
505error_brelse:
506 brelse(bh);
507error_close:
508 close_bdev_exclusive(bdev, FMODE_READ);
509error:
510 continue;
511 }
512 if (fs_devices->open_devices == 0) {
513 ret = -EIO;
514 goto out;
515 }
516 fs_devices->seeding = seeding;
517 fs_devices->opened = 1;
518 fs_devices->latest_bdev = latest_bdev;
519 fs_devices->latest_devid = latest_devid;
520 fs_devices->latest_trans = latest_transid;
521 fs_devices->total_rw_bytes = 0;
522out:
523 return ret;
524}
525
526int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
527 fmode_t flags, void *holder)
528{
529 int ret;
530
531 mutex_lock(&uuid_mutex);
532 if (fs_devices->opened) {
533 fs_devices->opened++;
534 ret = 0;
535 } else {
536 ret = __btrfs_open_devices(fs_devices, flags, holder);
537 }
538 mutex_unlock(&uuid_mutex);
539 return ret;
540}
541
542int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
543 struct btrfs_fs_devices **fs_devices_ret)
544{
545 struct btrfs_super_block *disk_super;
546 struct block_device *bdev;
547 struct buffer_head *bh;
548 int ret;
549 u64 devid;
550 u64 transid;
551
552 mutex_lock(&uuid_mutex);
553
554 bdev = open_bdev_exclusive(path, flags, holder);
555
556 if (IS_ERR(bdev)) {
557 ret = PTR_ERR(bdev);
558 goto error;
559 }
560
561 ret = set_blocksize(bdev, 4096);
562 if (ret)
563 goto error_close;
564 bh = btrfs_read_dev_super(bdev);
565 if (!bh) {
566 ret = -EIO;
567 goto error_close;
568 }
569 disk_super = (struct btrfs_super_block *)bh->b_data;
570 devid = le64_to_cpu(disk_super->dev_item.devid);
571 transid = btrfs_super_generation(disk_super);
572 if (disk_super->label[0])
573 printk(KERN_INFO "device label %s ", disk_super->label);
574 else {
575 /* FIXME, make a readl uuid parser */
576 printk(KERN_INFO "device fsid %llx-%llx ",
577 *(unsigned long long *)disk_super->fsid,
578 *(unsigned long long *)(disk_super->fsid + 8));
579 }
580 printk(KERN_INFO "devid %llu transid %llu %s\n",
581 (unsigned long long)devid, (unsigned long long)transid, path);
582 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
583
584 brelse(bh);
585error_close:
586 close_bdev_exclusive(bdev, flags);
587error:
588 mutex_unlock(&uuid_mutex);
589 return ret;
590}
591
592/*
593 * this uses a pretty simple search, the expectation is that it is
594 * called very infrequently and that a given device has a small number
595 * of extents
596 */
597static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
598 struct btrfs_device *device,
599 u64 num_bytes, u64 *start)
600{
601 struct btrfs_key key;
602 struct btrfs_root *root = device->dev_root;
603 struct btrfs_dev_extent *dev_extent = NULL;
604 struct btrfs_path *path;
605 u64 hole_size = 0;
606 u64 last_byte = 0;
607 u64 search_start = 0;
608 u64 search_end = device->total_bytes;
609 int ret;
610 int slot = 0;
611 int start_found;
612 struct extent_buffer *l;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617 path->reada = 2;
618 start_found = 0;
619
620 /* FIXME use last free of some kind */
621
622 /* we don't want to overwrite the superblock on the drive,
623 * so we make sure to start at an offset of at least 1MB
624 */
625 search_start = max((u64)1024 * 1024, search_start);
626
627 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
628 search_start = max(root->fs_info->alloc_start, search_start);
629
630 key.objectid = device->devid;
631 key.offset = search_start;
632 key.type = BTRFS_DEV_EXTENT_KEY;
633 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
634 if (ret < 0)
635 goto error;
636 ret = btrfs_previous_item(root, path, 0, key.type);
637 if (ret < 0)
638 goto error;
639 l = path->nodes[0];
640 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
641 while (1) {
642 l = path->nodes[0];
643 slot = path->slots[0];
644 if (slot >= btrfs_header_nritems(l)) {
645 ret = btrfs_next_leaf(root, path);
646 if (ret == 0)
647 continue;
648 if (ret < 0)
649 goto error;
650no_more_items:
651 if (!start_found) {
652 if (search_start >= search_end) {
653 ret = -ENOSPC;
654 goto error;
655 }
656 *start = search_start;
657 start_found = 1;
658 goto check_pending;
659 }
660 *start = last_byte > search_start ?
661 last_byte : search_start;
662 if (search_end <= *start) {
663 ret = -ENOSPC;
664 goto error;
665 }
666 goto check_pending;
667 }
668 btrfs_item_key_to_cpu(l, &key, slot);
669
670 if (key.objectid < device->devid)
671 goto next;
672
673 if (key.objectid > device->devid)
674 goto no_more_items;
675
676 if (key.offset >= search_start && key.offset > last_byte &&
677 start_found) {
678 if (last_byte < search_start)
679 last_byte = search_start;
680 hole_size = key.offset - last_byte;
681 if (key.offset > last_byte &&
682 hole_size >= num_bytes) {
683 *start = last_byte;
684 goto check_pending;
685 }
686 }
687 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
688 goto next;
689
690 start_found = 1;
691 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
692 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
693next:
694 path->slots[0]++;
695 cond_resched();
696 }
697check_pending:
698 /* we have to make sure we didn't find an extent that has already
699 * been allocated by the map tree or the original allocation
700 */
701 BUG_ON(*start < search_start);
702
703 if (*start + num_bytes > search_end) {
704 ret = -ENOSPC;
705 goto error;
706 }
707 /* check for pending inserts here */
708 ret = 0;
709
710error:
711 btrfs_free_path(path);
712 return ret;
713}
714
715static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
716 struct btrfs_device *device,
717 u64 start)
718{
719 int ret;
720 struct btrfs_path *path;
721 struct btrfs_root *root = device->dev_root;
722 struct btrfs_key key;
723 struct btrfs_key found_key;
724 struct extent_buffer *leaf = NULL;
725 struct btrfs_dev_extent *extent = NULL;
726
727 path = btrfs_alloc_path();
728 if (!path)
729 return -ENOMEM;
730
731 key.objectid = device->devid;
732 key.offset = start;
733 key.type = BTRFS_DEV_EXTENT_KEY;
734
735 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid,
738 BTRFS_DEV_EXTENT_KEY);
739 BUG_ON(ret);
740 leaf = path->nodes[0];
741 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
742 extent = btrfs_item_ptr(leaf, path->slots[0],
743 struct btrfs_dev_extent);
744 BUG_ON(found_key.offset > start || found_key.offset +
745 btrfs_dev_extent_length(leaf, extent) < start);
746 ret = 0;
747 } else if (ret == 0) {
748 leaf = path->nodes[0];
749 extent = btrfs_item_ptr(leaf, path->slots[0],
750 struct btrfs_dev_extent);
751 }
752 BUG_ON(ret);
753
754 if (device->bytes_used > 0)
755 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
756 ret = btrfs_del_item(trans, root, path);
757 BUG_ON(ret);
758
759 btrfs_free_path(path);
760 return ret;
761}
762
763int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
764 struct btrfs_device *device,
765 u64 chunk_tree, u64 chunk_objectid,
766 u64 chunk_offset, u64 start, u64 num_bytes)
767{
768 int ret;
769 struct btrfs_path *path;
770 struct btrfs_root *root = device->dev_root;
771 struct btrfs_dev_extent *extent;
772 struct extent_buffer *leaf;
773 struct btrfs_key key;
774
775 WARN_ON(!device->in_fs_metadata);
776 path = btrfs_alloc_path();
777 if (!path)
778 return -ENOMEM;
779
780 key.objectid = device->devid;
781 key.offset = start;
782 key.type = BTRFS_DEV_EXTENT_KEY;
783 ret = btrfs_insert_empty_item(trans, root, path, &key,
784 sizeof(*extent));
785 BUG_ON(ret);
786
787 leaf = path->nodes[0];
788 extent = btrfs_item_ptr(leaf, path->slots[0],
789 struct btrfs_dev_extent);
790 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
791 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
792 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
793
794 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
795 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
796 BTRFS_UUID_SIZE);
797
798 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
799 btrfs_mark_buffer_dirty(leaf);
800 btrfs_free_path(path);
801 return ret;
802}
803
804static noinline int find_next_chunk(struct btrfs_root *root,
805 u64 objectid, u64 *offset)
806{
807 struct btrfs_path *path;
808 int ret;
809 struct btrfs_key key;
810 struct btrfs_chunk *chunk;
811 struct btrfs_key found_key;
812
813 path = btrfs_alloc_path();
814 BUG_ON(!path);
815
816 key.objectid = objectid;
817 key.offset = (u64)-1;
818 key.type = BTRFS_CHUNK_ITEM_KEY;
819
820 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
821 if (ret < 0)
822 goto error;
823
824 BUG_ON(ret == 0);
825
826 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
827 if (ret) {
828 *offset = 0;
829 } else {
830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
831 path->slots[0]);
832 if (found_key.objectid != objectid)
833 *offset = 0;
834 else {
835 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
836 struct btrfs_chunk);
837 *offset = found_key.offset +
838 btrfs_chunk_length(path->nodes[0], chunk);
839 }
840 }
841 ret = 0;
842error:
843 btrfs_free_path(path);
844 return ret;
845}
846
847static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
848{
849 int ret;
850 struct btrfs_key key;
851 struct btrfs_key found_key;
852 struct btrfs_path *path;
853
854 root = root->fs_info->chunk_root;
855
856 path = btrfs_alloc_path();
857 if (!path)
858 return -ENOMEM;
859
860 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
861 key.type = BTRFS_DEV_ITEM_KEY;
862 key.offset = (u64)-1;
863
864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
865 if (ret < 0)
866 goto error;
867
868 BUG_ON(ret == 0);
869
870 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
871 BTRFS_DEV_ITEM_KEY);
872 if (ret) {
873 *objectid = 1;
874 } else {
875 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
876 path->slots[0]);
877 *objectid = found_key.offset + 1;
878 }
879 ret = 0;
880error:
881 btrfs_free_path(path);
882 return ret;
883}
884
885/*
886 * the device information is stored in the chunk root
887 * the btrfs_device struct should be fully filled in
888 */
889int btrfs_add_device(struct btrfs_trans_handle *trans,
890 struct btrfs_root *root,
891 struct btrfs_device *device)
892{
893 int ret;
894 struct btrfs_path *path;
895 struct btrfs_dev_item *dev_item;
896 struct extent_buffer *leaf;
897 struct btrfs_key key;
898 unsigned long ptr;
899
900 root = root->fs_info->chunk_root;
901
902 path = btrfs_alloc_path();
903 if (!path)
904 return -ENOMEM;
905
906 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
907 key.type = BTRFS_DEV_ITEM_KEY;
908 key.offset = device->devid;
909
910 ret = btrfs_insert_empty_item(trans, root, path, &key,
911 sizeof(*dev_item));
912 if (ret)
913 goto out;
914
915 leaf = path->nodes[0];
916 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
917
918 btrfs_set_device_id(leaf, dev_item, device->devid);
919 btrfs_set_device_generation(leaf, dev_item, 0);
920 btrfs_set_device_type(leaf, dev_item, device->type);
921 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
922 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
923 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
924 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
925 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
926 btrfs_set_device_group(leaf, dev_item, 0);
927 btrfs_set_device_seek_speed(leaf, dev_item, 0);
928 btrfs_set_device_bandwidth(leaf, dev_item, 0);
929 btrfs_set_device_start_offset(leaf, dev_item, 0);
930
931 ptr = (unsigned long)btrfs_device_uuid(dev_item);
932 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
933 ptr = (unsigned long)btrfs_device_fsid(dev_item);
934 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
935 btrfs_mark_buffer_dirty(leaf);
936
937 ret = 0;
938out:
939 btrfs_free_path(path);
940 return ret;
941}
942
943static int btrfs_rm_dev_item(struct btrfs_root *root,
944 struct btrfs_device *device)
945{
946 int ret;
947 struct btrfs_path *path;
948 struct btrfs_key key;
949 struct btrfs_trans_handle *trans;
950
951 root = root->fs_info->chunk_root;
952
953 path = btrfs_alloc_path();
954 if (!path)
955 return -ENOMEM;
956
957 trans = btrfs_start_transaction(root, 1);
958 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
959 key.type = BTRFS_DEV_ITEM_KEY;
960 key.offset = device->devid;
961 lock_chunks(root);
962
963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
964 if (ret < 0)
965 goto out;
966
967 if (ret > 0) {
968 ret = -ENOENT;
969 goto out;
970 }
971
972 ret = btrfs_del_item(trans, root, path);
973 if (ret)
974 goto out;
975out:
976 btrfs_free_path(path);
977 unlock_chunks(root);
978 btrfs_commit_transaction(trans, root);
979 return ret;
980}
981
982int btrfs_rm_device(struct btrfs_root *root, char *device_path)
983{
984 struct btrfs_device *device;
985 struct btrfs_device *next_device;
986 struct block_device *bdev;
987 struct buffer_head *bh = NULL;
988 struct btrfs_super_block *disk_super;
989 u64 all_avail;
990 u64 devid;
991 u64 num_devices;
992 u8 *dev_uuid;
993 int ret = 0;
994
995 mutex_lock(&uuid_mutex);
996 mutex_lock(&root->fs_info->volume_mutex);
997
998 all_avail = root->fs_info->avail_data_alloc_bits |
999 root->fs_info->avail_system_alloc_bits |
1000 root->fs_info->avail_metadata_alloc_bits;
1001
1002 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1003 root->fs_info->fs_devices->rw_devices <= 4) {
1004 printk(KERN_ERR "btrfs: unable to go below four devices "
1005 "on raid10\n");
1006 ret = -EINVAL;
1007 goto out;
1008 }
1009
1010 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1011 root->fs_info->fs_devices->rw_devices <= 2) {
1012 printk(KERN_ERR "btrfs: unable to go below two "
1013 "devices on raid1\n");
1014 ret = -EINVAL;
1015 goto out;
1016 }
1017
1018 if (strcmp(device_path, "missing") == 0) {
1019 struct list_head *cur;
1020 struct list_head *devices;
1021 struct btrfs_device *tmp;
1022
1023 device = NULL;
1024 devices = &root->fs_info->fs_devices->devices;
1025 list_for_each(cur, devices) {
1026 tmp = list_entry(cur, struct btrfs_device, dev_list);
1027 if (tmp->in_fs_metadata && !tmp->bdev) {
1028 device = tmp;
1029 break;
1030 }
1031 }
1032 bdev = NULL;
1033 bh = NULL;
1034 disk_super = NULL;
1035 if (!device) {
1036 printk(KERN_ERR "btrfs: no missing devices found to "
1037 "remove\n");
1038 goto out;
1039 }
1040 } else {
1041 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1042 root->fs_info->bdev_holder);
1043 if (IS_ERR(bdev)) {
1044 ret = PTR_ERR(bdev);
1045 goto out;
1046 }
1047
1048 set_blocksize(bdev, 4096);
1049 bh = btrfs_read_dev_super(bdev);
1050 if (!bh) {
1051 ret = -EIO;
1052 goto error_close;
1053 }
1054 disk_super = (struct btrfs_super_block *)bh->b_data;
1055 devid = le64_to_cpu(disk_super->dev_item.devid);
1056 dev_uuid = disk_super->dev_item.uuid;
1057 device = btrfs_find_device(root, devid, dev_uuid,
1058 disk_super->fsid);
1059 if (!device) {
1060 ret = -ENOENT;
1061 goto error_brelse;
1062 }
1063 }
1064
1065 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1066 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1067 "device\n");
1068 ret = -EINVAL;
1069 goto error_brelse;
1070 }
1071
1072 if (device->writeable) {
1073 list_del_init(&device->dev_alloc_list);
1074 root->fs_info->fs_devices->rw_devices--;
1075 }
1076
1077 ret = btrfs_shrink_device(device, 0);
1078 if (ret)
1079 goto error_brelse;
1080
1081 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1082 if (ret)
1083 goto error_brelse;
1084
1085 device->in_fs_metadata = 0;
1086 list_del_init(&device->dev_list);
1087 device->fs_devices->num_devices--;
1088
1089 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1090 struct btrfs_device, dev_list);
1091 if (device->bdev == root->fs_info->sb->s_bdev)
1092 root->fs_info->sb->s_bdev = next_device->bdev;
1093 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1094 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1095
1096 if (device->bdev) {
1097 close_bdev_exclusive(device->bdev, device->mode);
1098 device->bdev = NULL;
1099 device->fs_devices->open_devices--;
1100 }
1101
1102 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1103 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1104
1105 if (device->fs_devices->open_devices == 0) {
1106 struct btrfs_fs_devices *fs_devices;
1107 fs_devices = root->fs_info->fs_devices;
1108 while (fs_devices) {
1109 if (fs_devices->seed == device->fs_devices)
1110 break;
1111 fs_devices = fs_devices->seed;
1112 }
1113 fs_devices->seed = device->fs_devices->seed;
1114 device->fs_devices->seed = NULL;
1115 __btrfs_close_devices(device->fs_devices);
1116 free_fs_devices(device->fs_devices);
1117 }
1118
1119 /*
1120 * at this point, the device is zero sized. We want to
1121 * remove it from the devices list and zero out the old super
1122 */
1123 if (device->writeable) {
1124 /* make sure this device isn't detected as part of
1125 * the FS anymore
1126 */
1127 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1128 set_buffer_dirty(bh);
1129 sync_dirty_buffer(bh);
1130 }
1131
1132 kfree(device->name);
1133 kfree(device);
1134 ret = 0;
1135
1136error_brelse:
1137 brelse(bh);
1138error_close:
1139 if (bdev)
1140 close_bdev_exclusive(bdev, FMODE_READ);
1141out:
1142 mutex_unlock(&root->fs_info->volume_mutex);
1143 mutex_unlock(&uuid_mutex);
1144 return ret;
1145}
1146
1147/*
1148 * does all the dirty work required for changing file system's UUID.
1149 */
1150static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1151 struct btrfs_root *root)
1152{
1153 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1154 struct btrfs_fs_devices *old_devices;
1155 struct btrfs_fs_devices *seed_devices;
1156 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1157 struct btrfs_device *device;
1158 u64 super_flags;
1159
1160 BUG_ON(!mutex_is_locked(&uuid_mutex));
1161 if (!fs_devices->seeding)
1162 return -EINVAL;
1163
1164 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1165 if (!seed_devices)
1166 return -ENOMEM;
1167
1168 old_devices = clone_fs_devices(fs_devices);
1169 if (IS_ERR(old_devices)) {
1170 kfree(seed_devices);
1171 return PTR_ERR(old_devices);
1172 }
1173
1174 list_add(&old_devices->list, &fs_uuids);
1175
1176 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1177 seed_devices->opened = 1;
1178 INIT_LIST_HEAD(&seed_devices->devices);
1179 INIT_LIST_HEAD(&seed_devices->alloc_list);
1180 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1181 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1182 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1183 device->fs_devices = seed_devices;
1184 }
1185
1186 fs_devices->seeding = 0;
1187 fs_devices->num_devices = 0;
1188 fs_devices->open_devices = 0;
1189 fs_devices->seed = seed_devices;
1190
1191 generate_random_uuid(fs_devices->fsid);
1192 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1193 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1194 super_flags = btrfs_super_flags(disk_super) &
1195 ~BTRFS_SUPER_FLAG_SEEDING;
1196 btrfs_set_super_flags(disk_super, super_flags);
1197
1198 return 0;
1199}
1200
1201/*
1202 * strore the expected generation for seed devices in device items.
1203 */
1204static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1205 struct btrfs_root *root)
1206{
1207 struct btrfs_path *path;
1208 struct extent_buffer *leaf;
1209 struct btrfs_dev_item *dev_item;
1210 struct btrfs_device *device;
1211 struct btrfs_key key;
1212 u8 fs_uuid[BTRFS_UUID_SIZE];
1213 u8 dev_uuid[BTRFS_UUID_SIZE];
1214 u64 devid;
1215 int ret;
1216
1217 path = btrfs_alloc_path();
1218 if (!path)
1219 return -ENOMEM;
1220
1221 root = root->fs_info->chunk_root;
1222 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1223 key.offset = 0;
1224 key.type = BTRFS_DEV_ITEM_KEY;
1225
1226 while (1) {
1227 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1228 if (ret < 0)
1229 goto error;
1230
1231 leaf = path->nodes[0];
1232next_slot:
1233 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1234 ret = btrfs_next_leaf(root, path);
1235 if (ret > 0)
1236 break;
1237 if (ret < 0)
1238 goto error;
1239 leaf = path->nodes[0];
1240 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1241 btrfs_release_path(root, path);
1242 continue;
1243 }
1244
1245 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1246 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1247 key.type != BTRFS_DEV_ITEM_KEY)
1248 break;
1249
1250 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1251 struct btrfs_dev_item);
1252 devid = btrfs_device_id(leaf, dev_item);
1253 read_extent_buffer(leaf, dev_uuid,
1254 (unsigned long)btrfs_device_uuid(dev_item),
1255 BTRFS_UUID_SIZE);
1256 read_extent_buffer(leaf, fs_uuid,
1257 (unsigned long)btrfs_device_fsid(dev_item),
1258 BTRFS_UUID_SIZE);
1259 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1260 BUG_ON(!device);
1261
1262 if (device->fs_devices->seeding) {
1263 btrfs_set_device_generation(leaf, dev_item,
1264 device->generation);
1265 btrfs_mark_buffer_dirty(leaf);
1266 }
1267
1268 path->slots[0]++;
1269 goto next_slot;
1270 }
1271 ret = 0;
1272error:
1273 btrfs_free_path(path);
1274 return ret;
1275}
1276
1277int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1278{
1279 struct btrfs_trans_handle *trans;
1280 struct btrfs_device *device;
1281 struct block_device *bdev;
1282 struct list_head *cur;
1283 struct list_head *devices;
1284 struct super_block *sb = root->fs_info->sb;
1285 u64 total_bytes;
1286 int seeding_dev = 0;
1287 int ret = 0;
1288
1289 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1290 return -EINVAL;
1291
1292 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1293 if (!bdev)
1294 return -EIO;
1295
1296 if (root->fs_info->fs_devices->seeding) {
1297 seeding_dev = 1;
1298 down_write(&sb->s_umount);
1299 mutex_lock(&uuid_mutex);
1300 }
1301
1302 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1303 mutex_lock(&root->fs_info->volume_mutex);
1304
1305 devices = &root->fs_info->fs_devices->devices;
1306 list_for_each(cur, devices) {
1307 device = list_entry(cur, struct btrfs_device, dev_list);
1308 if (device->bdev == bdev) {
1309 ret = -EEXIST;
1310 goto error;
1311 }
1312 }
1313
1314 device = kzalloc(sizeof(*device), GFP_NOFS);
1315 if (!device) {
1316 /* we can safely leave the fs_devices entry around */
1317 ret = -ENOMEM;
1318 goto error;
1319 }
1320
1321 device->name = kstrdup(device_path, GFP_NOFS);
1322 if (!device->name) {
1323 kfree(device);
1324 ret = -ENOMEM;
1325 goto error;
1326 }
1327
1328 ret = find_next_devid(root, &device->devid);
1329 if (ret) {
1330 kfree(device);
1331 goto error;
1332 }
1333
1334 trans = btrfs_start_transaction(root, 1);
1335 lock_chunks(root);
1336
1337 device->barriers = 1;
1338 device->writeable = 1;
1339 device->work.func = pending_bios_fn;
1340 generate_random_uuid(device->uuid);
1341 spin_lock_init(&device->io_lock);
1342 device->generation = trans->transid;
1343 device->io_width = root->sectorsize;
1344 device->io_align = root->sectorsize;
1345 device->sector_size = root->sectorsize;
1346 device->total_bytes = i_size_read(bdev->bd_inode);
1347 device->dev_root = root->fs_info->dev_root;
1348 device->bdev = bdev;
1349 device->in_fs_metadata = 1;
1350 device->mode = 0;
1351 set_blocksize(device->bdev, 4096);
1352
1353 if (seeding_dev) {
1354 sb->s_flags &= ~MS_RDONLY;
1355 ret = btrfs_prepare_sprout(trans, root);
1356 BUG_ON(ret);
1357 }
1358
1359 device->fs_devices = root->fs_info->fs_devices;
1360 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1361 list_add(&device->dev_alloc_list,
1362 &root->fs_info->fs_devices->alloc_list);
1363 root->fs_info->fs_devices->num_devices++;
1364 root->fs_info->fs_devices->open_devices++;
1365 root->fs_info->fs_devices->rw_devices++;
1366 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1367
1368 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1369 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1370 total_bytes + device->total_bytes);
1371
1372 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1373 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1374 total_bytes + 1);
1375
1376 if (seeding_dev) {
1377 ret = init_first_rw_device(trans, root, device);
1378 BUG_ON(ret);
1379 ret = btrfs_finish_sprout(trans, root);
1380 BUG_ON(ret);
1381 } else {
1382 ret = btrfs_add_device(trans, root, device);
1383 }
1384
1385 unlock_chunks(root);
1386 btrfs_commit_transaction(trans, root);
1387
1388 if (seeding_dev) {
1389 mutex_unlock(&uuid_mutex);
1390 up_write(&sb->s_umount);
1391
1392 ret = btrfs_relocate_sys_chunks(root);
1393 BUG_ON(ret);
1394 }
1395out:
1396 mutex_unlock(&root->fs_info->volume_mutex);
1397 return ret;
1398error:
1399 close_bdev_exclusive(bdev, 0);
1400 if (seeding_dev) {
1401 mutex_unlock(&uuid_mutex);
1402 up_write(&sb->s_umount);
1403 }
1404 goto out;
1405}
1406
1407static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1408 struct btrfs_device *device)
1409{
1410 int ret;
1411 struct btrfs_path *path;
1412 struct btrfs_root *root;
1413 struct btrfs_dev_item *dev_item;
1414 struct extent_buffer *leaf;
1415 struct btrfs_key key;
1416
1417 root = device->dev_root->fs_info->chunk_root;
1418
1419 path = btrfs_alloc_path();
1420 if (!path)
1421 return -ENOMEM;
1422
1423 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1424 key.type = BTRFS_DEV_ITEM_KEY;
1425 key.offset = device->devid;
1426
1427 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1428 if (ret < 0)
1429 goto out;
1430
1431 if (ret > 0) {
1432 ret = -ENOENT;
1433 goto out;
1434 }
1435
1436 leaf = path->nodes[0];
1437 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1438
1439 btrfs_set_device_id(leaf, dev_item, device->devid);
1440 btrfs_set_device_type(leaf, dev_item, device->type);
1441 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1442 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1443 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1444 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1445 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1446 btrfs_mark_buffer_dirty(leaf);
1447
1448out:
1449 btrfs_free_path(path);
1450 return ret;
1451}
1452
1453static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1454 struct btrfs_device *device, u64 new_size)
1455{
1456 struct btrfs_super_block *super_copy =
1457 &device->dev_root->fs_info->super_copy;
1458 u64 old_total = btrfs_super_total_bytes(super_copy);
1459 u64 diff = new_size - device->total_bytes;
1460
1461 if (!device->writeable)
1462 return -EACCES;
1463 if (new_size <= device->total_bytes)
1464 return -EINVAL;
1465
1466 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1467 device->fs_devices->total_rw_bytes += diff;
1468
1469 device->total_bytes = new_size;
1470 return btrfs_update_device(trans, device);
1471}
1472
1473int btrfs_grow_device(struct btrfs_trans_handle *trans,
1474 struct btrfs_device *device, u64 new_size)
1475{
1476 int ret;
1477 lock_chunks(device->dev_root);
1478 ret = __btrfs_grow_device(trans, device, new_size);
1479 unlock_chunks(device->dev_root);
1480 return ret;
1481}
1482
1483static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1484 struct btrfs_root *root,
1485 u64 chunk_tree, u64 chunk_objectid,
1486 u64 chunk_offset)
1487{
1488 int ret;
1489 struct btrfs_path *path;
1490 struct btrfs_key key;
1491
1492 root = root->fs_info->chunk_root;
1493 path = btrfs_alloc_path();
1494 if (!path)
1495 return -ENOMEM;
1496
1497 key.objectid = chunk_objectid;
1498 key.offset = chunk_offset;
1499 key.type = BTRFS_CHUNK_ITEM_KEY;
1500
1501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1502 BUG_ON(ret);
1503
1504 ret = btrfs_del_item(trans, root, path);
1505 BUG_ON(ret);
1506
1507 btrfs_free_path(path);
1508 return 0;
1509}
1510
1511static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1512 chunk_offset)
1513{
1514 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1515 struct btrfs_disk_key *disk_key;
1516 struct btrfs_chunk *chunk;
1517 u8 *ptr;
1518 int ret = 0;
1519 u32 num_stripes;
1520 u32 array_size;
1521 u32 len = 0;
1522 u32 cur;
1523 struct btrfs_key key;
1524
1525 array_size = btrfs_super_sys_array_size(super_copy);
1526
1527 ptr = super_copy->sys_chunk_array;
1528 cur = 0;
1529
1530 while (cur < array_size) {
1531 disk_key = (struct btrfs_disk_key *)ptr;
1532 btrfs_disk_key_to_cpu(&key, disk_key);
1533
1534 len = sizeof(*disk_key);
1535
1536 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1537 chunk = (struct btrfs_chunk *)(ptr + len);
1538 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1539 len += btrfs_chunk_item_size(num_stripes);
1540 } else {
1541 ret = -EIO;
1542 break;
1543 }
1544 if (key.objectid == chunk_objectid &&
1545 key.offset == chunk_offset) {
1546 memmove(ptr, ptr + len, array_size - (cur + len));
1547 array_size -= len;
1548 btrfs_set_super_sys_array_size(super_copy, array_size);
1549 } else {
1550 ptr += len;
1551 cur += len;
1552 }
1553 }
1554 return ret;
1555}
1556
1557static int btrfs_relocate_chunk(struct btrfs_root *root,
1558 u64 chunk_tree, u64 chunk_objectid,
1559 u64 chunk_offset)
1560{
1561 struct extent_map_tree *em_tree;
1562 struct btrfs_root *extent_root;
1563 struct btrfs_trans_handle *trans;
1564 struct extent_map *em;
1565 struct map_lookup *map;
1566 int ret;
1567 int i;
1568
1569 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1570 (unsigned long long)chunk_offset);
1571 root = root->fs_info->chunk_root;
1572 extent_root = root->fs_info->extent_root;
1573 em_tree = &root->fs_info->mapping_tree.map_tree;
1574
1575 /* step one, relocate all the extents inside this chunk */
1576 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1577 BUG_ON(ret);
1578
1579 trans = btrfs_start_transaction(root, 1);
1580 BUG_ON(!trans);
1581
1582 lock_chunks(root);
1583
1584 /*
1585 * step two, delete the device extents and the
1586 * chunk tree entries
1587 */
1588 spin_lock(&em_tree->lock);
1589 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1590 spin_unlock(&em_tree->lock);
1591
1592 BUG_ON(em->start > chunk_offset ||
1593 em->start + em->len < chunk_offset);
1594 map = (struct map_lookup *)em->bdev;
1595
1596 for (i = 0; i < map->num_stripes; i++) {
1597 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1598 map->stripes[i].physical);
1599 BUG_ON(ret);
1600
1601 if (map->stripes[i].dev) {
1602 ret = btrfs_update_device(trans, map->stripes[i].dev);
1603 BUG_ON(ret);
1604 }
1605 }
1606 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1607 chunk_offset);
1608
1609 BUG_ON(ret);
1610
1611 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1612 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1613 BUG_ON(ret);
1614 }
1615
1616 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1617 BUG_ON(ret);
1618
1619 spin_lock(&em_tree->lock);
1620 remove_extent_mapping(em_tree, em);
1621 spin_unlock(&em_tree->lock);
1622
1623 kfree(map);
1624 em->bdev = NULL;
1625
1626 /* once for the tree */
1627 free_extent_map(em);
1628 /* once for us */
1629 free_extent_map(em);
1630
1631 unlock_chunks(root);
1632 btrfs_end_transaction(trans, root);
1633 return 0;
1634}
1635
1636static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1637{
1638 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1639 struct btrfs_path *path;
1640 struct extent_buffer *leaf;
1641 struct btrfs_chunk *chunk;
1642 struct btrfs_key key;
1643 struct btrfs_key found_key;
1644 u64 chunk_tree = chunk_root->root_key.objectid;
1645 u64 chunk_type;
1646 int ret;
1647
1648 path = btrfs_alloc_path();
1649 if (!path)
1650 return -ENOMEM;
1651
1652 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1653 key.offset = (u64)-1;
1654 key.type = BTRFS_CHUNK_ITEM_KEY;
1655
1656 while (1) {
1657 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1658 if (ret < 0)
1659 goto error;
1660 BUG_ON(ret == 0);
1661
1662 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1663 key.type);
1664 if (ret < 0)
1665 goto error;
1666 if (ret > 0)
1667 break;
1668
1669 leaf = path->nodes[0];
1670 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1671
1672 chunk = btrfs_item_ptr(leaf, path->slots[0],
1673 struct btrfs_chunk);
1674 chunk_type = btrfs_chunk_type(leaf, chunk);
1675 btrfs_release_path(chunk_root, path);
1676
1677 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1678 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1679 found_key.objectid,
1680 found_key.offset);
1681 BUG_ON(ret);
1682 }
1683
1684 if (found_key.offset == 0)
1685 break;
1686 key.offset = found_key.offset - 1;
1687 }
1688 ret = 0;
1689error:
1690 btrfs_free_path(path);
1691 return ret;
1692}
1693
1694static u64 div_factor(u64 num, int factor)
1695{
1696 if (factor == 10)
1697 return num;
1698 num *= factor;
1699 do_div(num, 10);
1700 return num;
1701}
1702
1703int btrfs_balance(struct btrfs_root *dev_root)
1704{
1705 int ret;
1706 struct list_head *cur;
1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1708 struct btrfs_device *device;
1709 u64 old_size;
1710 u64 size_to_free;
1711 struct btrfs_path *path;
1712 struct btrfs_key key;
1713 struct btrfs_chunk *chunk;
1714 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1715 struct btrfs_trans_handle *trans;
1716 struct btrfs_key found_key;
1717
1718 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1719 return -EROFS;
1720
1721 mutex_lock(&dev_root->fs_info->volume_mutex);
1722 dev_root = dev_root->fs_info->dev_root;
1723
1724 /* step one make some room on all the devices */
1725 list_for_each(cur, devices) {
1726 device = list_entry(cur, struct btrfs_device, dev_list);
1727 old_size = device->total_bytes;
1728 size_to_free = div_factor(old_size, 1);
1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1730 if (!device->writeable ||
1731 device->total_bytes - device->bytes_used > size_to_free)
1732 continue;
1733
1734 ret = btrfs_shrink_device(device, old_size - size_to_free);
1735 BUG_ON(ret);
1736
1737 trans = btrfs_start_transaction(dev_root, 1);
1738 BUG_ON(!trans);
1739
1740 ret = btrfs_grow_device(trans, device, old_size);
1741 BUG_ON(ret);
1742
1743 btrfs_end_transaction(trans, dev_root);
1744 }
1745
1746 /* step two, relocate all the chunks */
1747 path = btrfs_alloc_path();
1748 BUG_ON(!path);
1749
1750 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1751 key.offset = (u64)-1;
1752 key.type = BTRFS_CHUNK_ITEM_KEY;
1753
1754 while (1) {
1755 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1756 if (ret < 0)
1757 goto error;
1758
1759 /*
1760 * this shouldn't happen, it means the last relocate
1761 * failed
1762 */
1763 if (ret == 0)
1764 break;
1765
1766 ret = btrfs_previous_item(chunk_root, path, 0,
1767 BTRFS_CHUNK_ITEM_KEY);
1768 if (ret)
1769 break;
1770
1771 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1772 path->slots[0]);
1773 if (found_key.objectid != key.objectid)
1774 break;
1775
1776 chunk = btrfs_item_ptr(path->nodes[0],
1777 path->slots[0],
1778 struct btrfs_chunk);
1779 key.offset = found_key.offset;
1780 /* chunk zero is special */
1781 if (key.offset == 0)
1782 break;
1783
1784 btrfs_release_path(chunk_root, path);
1785 ret = btrfs_relocate_chunk(chunk_root,
1786 chunk_root->root_key.objectid,
1787 found_key.objectid,
1788 found_key.offset);
1789 BUG_ON(ret);
1790 }
1791 ret = 0;
1792error:
1793 btrfs_free_path(path);
1794 mutex_unlock(&dev_root->fs_info->volume_mutex);
1795 return ret;
1796}
1797
1798/*
1799 * shrinking a device means finding all of the device extents past
1800 * the new size, and then following the back refs to the chunks.
1801 * The chunk relocation code actually frees the device extent
1802 */
1803int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1804{
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_root *root = device->dev_root;
1807 struct btrfs_dev_extent *dev_extent = NULL;
1808 struct btrfs_path *path;
1809 u64 length;
1810 u64 chunk_tree;
1811 u64 chunk_objectid;
1812 u64 chunk_offset;
1813 int ret;
1814 int slot;
1815 struct extent_buffer *l;
1816 struct btrfs_key key;
1817 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1818 u64 old_total = btrfs_super_total_bytes(super_copy);
1819 u64 diff = device->total_bytes - new_size;
1820
1821 if (new_size >= device->total_bytes)
1822 return -EINVAL;
1823
1824 path = btrfs_alloc_path();
1825 if (!path)
1826 return -ENOMEM;
1827
1828 trans = btrfs_start_transaction(root, 1);
1829 if (!trans) {
1830 ret = -ENOMEM;
1831 goto done;
1832 }
1833
1834 path->reada = 2;
1835
1836 lock_chunks(root);
1837
1838 device->total_bytes = new_size;
1839 if (device->writeable)
1840 device->fs_devices->total_rw_bytes -= diff;
1841 ret = btrfs_update_device(trans, device);
1842 if (ret) {
1843 unlock_chunks(root);
1844 btrfs_end_transaction(trans, root);
1845 goto done;
1846 }
1847 WARN_ON(diff > old_total);
1848 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1849 unlock_chunks(root);
1850 btrfs_end_transaction(trans, root);
1851
1852 key.objectid = device->devid;
1853 key.offset = (u64)-1;
1854 key.type = BTRFS_DEV_EXTENT_KEY;
1855
1856 while (1) {
1857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1858 if (ret < 0)
1859 goto done;
1860
1861 ret = btrfs_previous_item(root, path, 0, key.type);
1862 if (ret < 0)
1863 goto done;
1864 if (ret) {
1865 ret = 0;
1866 goto done;
1867 }
1868
1869 l = path->nodes[0];
1870 slot = path->slots[0];
1871 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1872
1873 if (key.objectid != device->devid)
1874 goto done;
1875
1876 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1877 length = btrfs_dev_extent_length(l, dev_extent);
1878
1879 if (key.offset + length <= new_size)
1880 goto done;
1881
1882 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1883 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1884 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1885 btrfs_release_path(root, path);
1886
1887 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1888 chunk_offset);
1889 if (ret)
1890 goto done;
1891 }
1892
1893done:
1894 btrfs_free_path(path);
1895 return ret;
1896}
1897
1898static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1899 struct btrfs_root *root,
1900 struct btrfs_key *key,
1901 struct btrfs_chunk *chunk, int item_size)
1902{
1903 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1904 struct btrfs_disk_key disk_key;
1905 u32 array_size;
1906 u8 *ptr;
1907
1908 array_size = btrfs_super_sys_array_size(super_copy);
1909 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1910 return -EFBIG;
1911
1912 ptr = super_copy->sys_chunk_array + array_size;
1913 btrfs_cpu_key_to_disk(&disk_key, key);
1914 memcpy(ptr, &disk_key, sizeof(disk_key));
1915 ptr += sizeof(disk_key);
1916 memcpy(ptr, chunk, item_size);
1917 item_size += sizeof(disk_key);
1918 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1919 return 0;
1920}
1921
1922static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1923 int num_stripes, int sub_stripes)
1924{
1925 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1926 return calc_size;
1927 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1928 return calc_size * (num_stripes / sub_stripes);
1929 else
1930 return calc_size * num_stripes;
1931}
1932
1933static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *extent_root,
1935 struct map_lookup **map_ret,
1936 u64 *num_bytes, u64 *stripe_size,
1937 u64 start, u64 type)
1938{
1939 struct btrfs_fs_info *info = extent_root->fs_info;
1940 struct btrfs_device *device = NULL;
1941 struct btrfs_fs_devices *fs_devices = info->fs_devices;
1942 struct list_head *cur;
1943 struct map_lookup *map = NULL;
1944 struct extent_map_tree *em_tree;
1945 struct extent_map *em;
1946 struct list_head private_devs;
1947 int min_stripe_size = 1 * 1024 * 1024;
1948 u64 calc_size = 1024 * 1024 * 1024;
1949 u64 max_chunk_size = calc_size;
1950 u64 min_free;
1951 u64 avail;
1952 u64 max_avail = 0;
1953 u64 dev_offset;
1954 int num_stripes = 1;
1955 int min_stripes = 1;
1956 int sub_stripes = 0;
1957 int looped = 0;
1958 int ret;
1959 int index;
1960 int stripe_len = 64 * 1024;
1961
1962 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1963 (type & BTRFS_BLOCK_GROUP_DUP)) {
1964 WARN_ON(1);
1965 type &= ~BTRFS_BLOCK_GROUP_DUP;
1966 }
1967 if (list_empty(&fs_devices->alloc_list))
1968 return -ENOSPC;
1969
1970 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1971 num_stripes = fs_devices->rw_devices;
1972 min_stripes = 2;
1973 }
1974 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1975 num_stripes = 2;
1976 min_stripes = 2;
1977 }
1978 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1979 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1980 if (num_stripes < 2)
1981 return -ENOSPC;
1982 min_stripes = 2;
1983 }
1984 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1985 num_stripes = fs_devices->rw_devices;
1986 if (num_stripes < 4)
1987 return -ENOSPC;
1988 num_stripes &= ~(u32)1;
1989 sub_stripes = 2;
1990 min_stripes = 4;
1991 }
1992
1993 if (type & BTRFS_BLOCK_GROUP_DATA) {
1994 max_chunk_size = 10 * calc_size;
1995 min_stripe_size = 64 * 1024 * 1024;
1996 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1997 max_chunk_size = 4 * calc_size;
1998 min_stripe_size = 32 * 1024 * 1024;
1999 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2000 calc_size = 8 * 1024 * 1024;
2001 max_chunk_size = calc_size * 2;
2002 min_stripe_size = 1 * 1024 * 1024;
2003 }
2004
2005 /* we don't want a chunk larger than 10% of writeable space */
2006 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2007 max_chunk_size);
2008
2009again:
2010 if (!map || map->num_stripes != num_stripes) {
2011 kfree(map);
2012 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2013 if (!map)
2014 return -ENOMEM;
2015 map->num_stripes = num_stripes;
2016 }
2017
2018 if (calc_size * num_stripes > max_chunk_size) {
2019 calc_size = max_chunk_size;
2020 do_div(calc_size, num_stripes);
2021 do_div(calc_size, stripe_len);
2022 calc_size *= stripe_len;
2023 }
2024 /* we don't want tiny stripes */
2025 calc_size = max_t(u64, min_stripe_size, calc_size);
2026
2027 do_div(calc_size, stripe_len);
2028 calc_size *= stripe_len;
2029
2030 cur = fs_devices->alloc_list.next;
2031 index = 0;
2032
2033 if (type & BTRFS_BLOCK_GROUP_DUP)
2034 min_free = calc_size * 2;
2035 else
2036 min_free = calc_size;
2037
2038 /*
2039 * we add 1MB because we never use the first 1MB of the device, unless
2040 * we've looped, then we are likely allocating the maximum amount of
2041 * space left already
2042 */
2043 if (!looped)
2044 min_free += 1024 * 1024;
2045
2046 INIT_LIST_HEAD(&private_devs);
2047 while (index < num_stripes) {
2048 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2049 BUG_ON(!device->writeable);
2050 if (device->total_bytes > device->bytes_used)
2051 avail = device->total_bytes - device->bytes_used;
2052 else
2053 avail = 0;
2054 cur = cur->next;
2055
2056 if (device->in_fs_metadata && avail >= min_free) {
2057 ret = find_free_dev_extent(trans, device,
2058 min_free, &dev_offset);
2059 if (ret == 0) {
2060 list_move_tail(&device->dev_alloc_list,
2061 &private_devs);
2062 map->stripes[index].dev = device;
2063 map->stripes[index].physical = dev_offset;
2064 index++;
2065 if (type & BTRFS_BLOCK_GROUP_DUP) {
2066 map->stripes[index].dev = device;
2067 map->stripes[index].physical =
2068 dev_offset + calc_size;
2069 index++;
2070 }
2071 }
2072 } else if (device->in_fs_metadata && avail > max_avail)
2073 max_avail = avail;
2074 if (cur == &fs_devices->alloc_list)
2075 break;
2076 }
2077 list_splice(&private_devs, &fs_devices->alloc_list);
2078 if (index < num_stripes) {
2079 if (index >= min_stripes) {
2080 num_stripes = index;
2081 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2082 num_stripes /= sub_stripes;
2083 num_stripes *= sub_stripes;
2084 }
2085 looped = 1;
2086 goto again;
2087 }
2088 if (!looped && max_avail > 0) {
2089 looped = 1;
2090 calc_size = max_avail;
2091 goto again;
2092 }
2093 kfree(map);
2094 return -ENOSPC;
2095 }
2096 map->sector_size = extent_root->sectorsize;
2097 map->stripe_len = stripe_len;
2098 map->io_align = stripe_len;
2099 map->io_width = stripe_len;
2100 map->type = type;
2101 map->num_stripes = num_stripes;
2102 map->sub_stripes = sub_stripes;
2103
2104 *map_ret = map;
2105 *stripe_size = calc_size;
2106 *num_bytes = chunk_bytes_by_type(type, calc_size,
2107 num_stripes, sub_stripes);
2108
2109 em = alloc_extent_map(GFP_NOFS);
2110 if (!em) {
2111 kfree(map);
2112 return -ENOMEM;
2113 }
2114 em->bdev = (struct block_device *)map;
2115 em->start = start;
2116 em->len = *num_bytes;
2117 em->block_start = 0;
2118 em->block_len = em->len;
2119
2120 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2121 spin_lock(&em_tree->lock);
2122 ret = add_extent_mapping(em_tree, em);
2123 spin_unlock(&em_tree->lock);
2124 BUG_ON(ret);
2125 free_extent_map(em);
2126
2127 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2128 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2129 start, *num_bytes);
2130 BUG_ON(ret);
2131
2132 index = 0;
2133 while (index < map->num_stripes) {
2134 device = map->stripes[index].dev;
2135 dev_offset = map->stripes[index].physical;
2136
2137 ret = btrfs_alloc_dev_extent(trans, device,
2138 info->chunk_root->root_key.objectid,
2139 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2140 start, dev_offset, calc_size);
2141 BUG_ON(ret);
2142 index++;
2143 }
2144
2145 return 0;
2146}
2147
2148static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2149 struct btrfs_root *extent_root,
2150 struct map_lookup *map, u64 chunk_offset,
2151 u64 chunk_size, u64 stripe_size)
2152{
2153 u64 dev_offset;
2154 struct btrfs_key key;
2155 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2156 struct btrfs_device *device;
2157 struct btrfs_chunk *chunk;
2158 struct btrfs_stripe *stripe;
2159 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2160 int index = 0;
2161 int ret;
2162
2163 chunk = kzalloc(item_size, GFP_NOFS);
2164 if (!chunk)
2165 return -ENOMEM;
2166
2167 index = 0;
2168 while (index < map->num_stripes) {
2169 device = map->stripes[index].dev;
2170 device->bytes_used += stripe_size;
2171 ret = btrfs_update_device(trans, device);
2172 BUG_ON(ret);
2173 index++;
2174 }
2175
2176 index = 0;
2177 stripe = &chunk->stripe;
2178 while (index < map->num_stripes) {
2179 device = map->stripes[index].dev;
2180 dev_offset = map->stripes[index].physical;
2181
2182 btrfs_set_stack_stripe_devid(stripe, device->devid);
2183 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2184 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2185 stripe++;
2186 index++;
2187 }
2188
2189 btrfs_set_stack_chunk_length(chunk, chunk_size);
2190 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2191 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2192 btrfs_set_stack_chunk_type(chunk, map->type);
2193 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2194 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2195 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2196 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2197 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2198
2199 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2200 key.type = BTRFS_CHUNK_ITEM_KEY;
2201 key.offset = chunk_offset;
2202
2203 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2204 BUG_ON(ret);
2205
2206 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2207 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2208 item_size);
2209 BUG_ON(ret);
2210 }
2211 kfree(chunk);
2212 return 0;
2213}
2214
2215/*
2216 * Chunk allocation falls into two parts. The first part does works
2217 * that make the new allocated chunk useable, but not do any operation
2218 * that modifies the chunk tree. The second part does the works that
2219 * require modifying the chunk tree. This division is important for the
2220 * bootstrap process of adding storage to a seed btrfs.
2221 */
2222int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2223 struct btrfs_root *extent_root, u64 type)
2224{
2225 u64 chunk_offset;
2226 u64 chunk_size;
2227 u64 stripe_size;
2228 struct map_lookup *map;
2229 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2230 int ret;
2231
2232 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2233 &chunk_offset);
2234 if (ret)
2235 return ret;
2236
2237 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2238 &stripe_size, chunk_offset, type);
2239 if (ret)
2240 return ret;
2241
2242 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2243 chunk_size, stripe_size);
2244 BUG_ON(ret);
2245 return 0;
2246}
2247
2248static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root,
2250 struct btrfs_device *device)
2251{
2252 u64 chunk_offset;
2253 u64 sys_chunk_offset;
2254 u64 chunk_size;
2255 u64 sys_chunk_size;
2256 u64 stripe_size;
2257 u64 sys_stripe_size;
2258 u64 alloc_profile;
2259 struct map_lookup *map;
2260 struct map_lookup *sys_map;
2261 struct btrfs_fs_info *fs_info = root->fs_info;
2262 struct btrfs_root *extent_root = fs_info->extent_root;
2263 int ret;
2264
2265 ret = find_next_chunk(fs_info->chunk_root,
2266 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2267 BUG_ON(ret);
2268
2269 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2270 (fs_info->metadata_alloc_profile &
2271 fs_info->avail_metadata_alloc_bits);
2272 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2273
2274 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2275 &stripe_size, chunk_offset, alloc_profile);
2276 BUG_ON(ret);
2277
2278 sys_chunk_offset = chunk_offset + chunk_size;
2279
2280 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2281 (fs_info->system_alloc_profile &
2282 fs_info->avail_system_alloc_bits);
2283 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2284
2285 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2286 &sys_chunk_size, &sys_stripe_size,
2287 sys_chunk_offset, alloc_profile);
2288 BUG_ON(ret);
2289
2290 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2291 BUG_ON(ret);
2292
2293 /*
2294 * Modifying chunk tree needs allocating new blocks from both
2295 * system block group and metadata block group. So we only can
2296 * do operations require modifying the chunk tree after both
2297 * block groups were created.
2298 */
2299 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2300 chunk_size, stripe_size);
2301 BUG_ON(ret);
2302
2303 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2304 sys_chunk_offset, sys_chunk_size,
2305 sys_stripe_size);
2306 BUG_ON(ret);
2307 return 0;
2308}
2309
2310int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2311{
2312 struct extent_map *em;
2313 struct map_lookup *map;
2314 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2315 int readonly = 0;
2316 int i;
2317
2318 spin_lock(&map_tree->map_tree.lock);
2319 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2320 spin_unlock(&map_tree->map_tree.lock);
2321 if (!em)
2322 return 1;
2323
2324 map = (struct map_lookup *)em->bdev;
2325 for (i = 0; i < map->num_stripes; i++) {
2326 if (!map->stripes[i].dev->writeable) {
2327 readonly = 1;
2328 break;
2329 }
2330 }
2331 free_extent_map(em);
2332 return readonly;
2333}
2334
2335void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2336{
2337 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2338}
2339
2340void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2341{
2342 struct extent_map *em;
2343
2344 while (1) {
2345 spin_lock(&tree->map_tree.lock);
2346 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2347 if (em)
2348 remove_extent_mapping(&tree->map_tree, em);
2349 spin_unlock(&tree->map_tree.lock);
2350 if (!em)
2351 break;
2352 kfree(em->bdev);
2353 /* once for us */
2354 free_extent_map(em);
2355 /* once for the tree */
2356 free_extent_map(em);
2357 }
2358}
2359
2360int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2361{
2362 struct extent_map *em;
2363 struct map_lookup *map;
2364 struct extent_map_tree *em_tree = &map_tree->map_tree;
2365 int ret;
2366
2367 spin_lock(&em_tree->lock);
2368 em = lookup_extent_mapping(em_tree, logical, len);
2369 spin_unlock(&em_tree->lock);
2370 BUG_ON(!em);
2371
2372 BUG_ON(em->start > logical || em->start + em->len < logical);
2373 map = (struct map_lookup *)em->bdev;
2374 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2375 ret = map->num_stripes;
2376 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2377 ret = map->sub_stripes;
2378 else
2379 ret = 1;
2380 free_extent_map(em);
2381 return ret;
2382}
2383
2384static int find_live_mirror(struct map_lookup *map, int first, int num,
2385 int optimal)
2386{
2387 int i;
2388 if (map->stripes[optimal].dev->bdev)
2389 return optimal;
2390 for (i = first; i < first + num; i++) {
2391 if (map->stripes[i].dev->bdev)
2392 return i;
2393 }
2394 /* we couldn't find one that doesn't fail. Just return something
2395 * and the io error handling code will clean up eventually
2396 */
2397 return optimal;
2398}
2399
2400static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2401 u64 logical, u64 *length,
2402 struct btrfs_multi_bio **multi_ret,
2403 int mirror_num, struct page *unplug_page)
2404{
2405 struct extent_map *em;
2406 struct map_lookup *map;
2407 struct extent_map_tree *em_tree = &map_tree->map_tree;
2408 u64 offset;
2409 u64 stripe_offset;
2410 u64 stripe_nr;
2411 int stripes_allocated = 8;
2412 int stripes_required = 1;
2413 int stripe_index;
2414 int i;
2415 int num_stripes;
2416 int max_errors = 0;
2417 struct btrfs_multi_bio *multi = NULL;
2418
2419 if (multi_ret && !(rw & (1 << BIO_RW)))
2420 stripes_allocated = 1;
2421again:
2422 if (multi_ret) {
2423 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2424 GFP_NOFS);
2425 if (!multi)
2426 return -ENOMEM;
2427
2428 atomic_set(&multi->error, 0);
2429 }
2430
2431 spin_lock(&em_tree->lock);
2432 em = lookup_extent_mapping(em_tree, logical, *length);
2433 spin_unlock(&em_tree->lock);
2434
2435 if (!em && unplug_page)
2436 return 0;
2437
2438 if (!em) {
2439 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2440 (unsigned long long)logical,
2441 (unsigned long long)*length);
2442 BUG();
2443 }
2444
2445 BUG_ON(em->start > logical || em->start + em->len < logical);
2446 map = (struct map_lookup *)em->bdev;
2447 offset = logical - em->start;
2448
2449 if (mirror_num > map->num_stripes)
2450 mirror_num = 0;
2451
2452 /* if our multi bio struct is too small, back off and try again */
2453 if (rw & (1 << BIO_RW)) {
2454 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2455 BTRFS_BLOCK_GROUP_DUP)) {
2456 stripes_required = map->num_stripes;
2457 max_errors = 1;
2458 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2459 stripes_required = map->sub_stripes;
2460 max_errors = 1;
2461 }
2462 }
2463 if (multi_ret && rw == WRITE &&
2464 stripes_allocated < stripes_required) {
2465 stripes_allocated = map->num_stripes;
2466 free_extent_map(em);
2467 kfree(multi);
2468 goto again;
2469 }
2470 stripe_nr = offset;
2471 /*
2472 * stripe_nr counts the total number of stripes we have to stride
2473 * to get to this block
2474 */
2475 do_div(stripe_nr, map->stripe_len);
2476
2477 stripe_offset = stripe_nr * map->stripe_len;
2478 BUG_ON(offset < stripe_offset);
2479
2480 /* stripe_offset is the offset of this block in its stripe*/
2481 stripe_offset = offset - stripe_offset;
2482
2483 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2484 BTRFS_BLOCK_GROUP_RAID10 |
2485 BTRFS_BLOCK_GROUP_DUP)) {
2486 /* we limit the length of each bio to what fits in a stripe */
2487 *length = min_t(u64, em->len - offset,
2488 map->stripe_len - stripe_offset);
2489 } else {
2490 *length = em->len - offset;
2491 }
2492
2493 if (!multi_ret && !unplug_page)
2494 goto out;
2495
2496 num_stripes = 1;
2497 stripe_index = 0;
2498 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2499 if (unplug_page || (rw & (1 << BIO_RW)))
2500 num_stripes = map->num_stripes;
2501 else if (mirror_num)
2502 stripe_index = mirror_num - 1;
2503 else {
2504 stripe_index = find_live_mirror(map, 0,
2505 map->num_stripes,
2506 current->pid % map->num_stripes);
2507 }
2508
2509 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2510 if (rw & (1 << BIO_RW))
2511 num_stripes = map->num_stripes;
2512 else if (mirror_num)
2513 stripe_index = mirror_num - 1;
2514
2515 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2516 int factor = map->num_stripes / map->sub_stripes;
2517
2518 stripe_index = do_div(stripe_nr, factor);
2519 stripe_index *= map->sub_stripes;
2520
2521 if (unplug_page || (rw & (1 << BIO_RW)))
2522 num_stripes = map->sub_stripes;
2523 else if (mirror_num)
2524 stripe_index += mirror_num - 1;
2525 else {
2526 stripe_index = find_live_mirror(map, stripe_index,
2527 map->sub_stripes, stripe_index +
2528 current->pid % map->sub_stripes);
2529 }
2530 } else {
2531 /*
2532 * after this do_div call, stripe_nr is the number of stripes
2533 * on this device we have to walk to find the data, and
2534 * stripe_index is the number of our device in the stripe array
2535 */
2536 stripe_index = do_div(stripe_nr, map->num_stripes);
2537 }
2538 BUG_ON(stripe_index >= map->num_stripes);
2539
2540 for (i = 0; i < num_stripes; i++) {
2541 if (unplug_page) {
2542 struct btrfs_device *device;
2543 struct backing_dev_info *bdi;
2544
2545 device = map->stripes[stripe_index].dev;
2546 if (device->bdev) {
2547 bdi = blk_get_backing_dev_info(device->bdev);
2548 if (bdi->unplug_io_fn)
2549 bdi->unplug_io_fn(bdi, unplug_page);
2550 }
2551 } else {
2552 multi->stripes[i].physical =
2553 map->stripes[stripe_index].physical +
2554 stripe_offset + stripe_nr * map->stripe_len;
2555 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2556 }
2557 stripe_index++;
2558 }
2559 if (multi_ret) {
2560 *multi_ret = multi;
2561 multi->num_stripes = num_stripes;
2562 multi->max_errors = max_errors;
2563 }
2564out:
2565 free_extent_map(em);
2566 return 0;
2567}
2568
2569int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2570 u64 logical, u64 *length,
2571 struct btrfs_multi_bio **multi_ret, int mirror_num)
2572{
2573 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2574 mirror_num, NULL);
2575}
2576
2577int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2578 u64 chunk_start, u64 physical, u64 devid,
2579 u64 **logical, int *naddrs, int *stripe_len)
2580{
2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2582 struct extent_map *em;
2583 struct map_lookup *map;
2584 u64 *buf;
2585 u64 bytenr;
2586 u64 length;
2587 u64 stripe_nr;
2588 int i, j, nr = 0;
2589
2590 spin_lock(&em_tree->lock);
2591 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2592 spin_unlock(&em_tree->lock);
2593
2594 BUG_ON(!em || em->start != chunk_start);
2595 map = (struct map_lookup *)em->bdev;
2596
2597 length = em->len;
2598 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2599 do_div(length, map->num_stripes / map->sub_stripes);
2600 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2601 do_div(length, map->num_stripes);
2602
2603 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2604 BUG_ON(!buf);
2605
2606 for (i = 0; i < map->num_stripes; i++) {
2607 if (devid && map->stripes[i].dev->devid != devid)
2608 continue;
2609 if (map->stripes[i].physical > physical ||
2610 map->stripes[i].physical + length <= physical)
2611 continue;
2612
2613 stripe_nr = physical - map->stripes[i].physical;
2614 do_div(stripe_nr, map->stripe_len);
2615
2616 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2617 stripe_nr = stripe_nr * map->num_stripes + i;
2618 do_div(stripe_nr, map->sub_stripes);
2619 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2620 stripe_nr = stripe_nr * map->num_stripes + i;
2621 }
2622 bytenr = chunk_start + stripe_nr * map->stripe_len;
2623 WARN_ON(nr >= map->num_stripes);
2624 for (j = 0; j < nr; j++) {
2625 if (buf[j] == bytenr)
2626 break;
2627 }
2628 if (j == nr) {
2629 WARN_ON(nr >= map->num_stripes);
2630 buf[nr++] = bytenr;
2631 }
2632 }
2633
2634 for (i = 0; i > nr; i++) {
2635 struct btrfs_multi_bio *multi;
2636 struct btrfs_bio_stripe *stripe;
2637 int ret;
2638
2639 length = 1;
2640 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2641 &length, &multi, 0);
2642 BUG_ON(ret);
2643
2644 stripe = multi->stripes;
2645 for (j = 0; j < multi->num_stripes; j++) {
2646 if (stripe->physical >= physical &&
2647 physical < stripe->physical + length)
2648 break;
2649 }
2650 BUG_ON(j >= multi->num_stripes);
2651 kfree(multi);
2652 }
2653
2654 *logical = buf;
2655 *naddrs = nr;
2656 *stripe_len = map->stripe_len;
2657
2658 free_extent_map(em);
2659 return 0;
2660}
2661
2662int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2663 u64 logical, struct page *page)
2664{
2665 u64 length = PAGE_CACHE_SIZE;
2666 return __btrfs_map_block(map_tree, READ, logical, &length,
2667 NULL, 0, page);
2668}
2669
2670static void end_bio_multi_stripe(struct bio *bio, int err)
2671{
2672 struct btrfs_multi_bio *multi = bio->bi_private;
2673 int is_orig_bio = 0;
2674
2675 if (err)
2676 atomic_inc(&multi->error);
2677
2678 if (bio == multi->orig_bio)
2679 is_orig_bio = 1;
2680
2681 if (atomic_dec_and_test(&multi->stripes_pending)) {
2682 if (!is_orig_bio) {
2683 bio_put(bio);
2684 bio = multi->orig_bio;
2685 }
2686 bio->bi_private = multi->private;
2687 bio->bi_end_io = multi->end_io;
2688 /* only send an error to the higher layers if it is
2689 * beyond the tolerance of the multi-bio
2690 */
2691 if (atomic_read(&multi->error) > multi->max_errors) {
2692 err = -EIO;
2693 } else if (err) {
2694 /*
2695 * this bio is actually up to date, we didn't
2696 * go over the max number of errors
2697 */
2698 set_bit(BIO_UPTODATE, &bio->bi_flags);
2699 err = 0;
2700 }
2701 kfree(multi);
2702
2703 bio_endio(bio, err);
2704 } else if (!is_orig_bio) {
2705 bio_put(bio);
2706 }
2707}
2708
2709struct async_sched {
2710 struct bio *bio;
2711 int rw;
2712 struct btrfs_fs_info *info;
2713 struct btrfs_work work;
2714};
2715
2716/*
2717 * see run_scheduled_bios for a description of why bios are collected for
2718 * async submit.
2719 *
2720 * This will add one bio to the pending list for a device and make sure
2721 * the work struct is scheduled.
2722 */
2723static noinline int schedule_bio(struct btrfs_root *root,
2724 struct btrfs_device *device,
2725 int rw, struct bio *bio)
2726{
2727 int should_queue = 1;
2728
2729 /* don't bother with additional async steps for reads, right now */
2730 if (!(rw & (1 << BIO_RW))) {
2731 bio_get(bio);
2732 submit_bio(rw, bio);
2733 bio_put(bio);
2734 return 0;
2735 }
2736
2737 /*
2738 * nr_async_bios allows us to reliably return congestion to the
2739 * higher layers. Otherwise, the async bio makes it appear we have
2740 * made progress against dirty pages when we've really just put it
2741 * on a queue for later
2742 */
2743 atomic_inc(&root->fs_info->nr_async_bios);
2744 WARN_ON(bio->bi_next);
2745 bio->bi_next = NULL;
2746 bio->bi_rw |= rw;
2747
2748 spin_lock(&device->io_lock);
2749
2750 if (device->pending_bio_tail)
2751 device->pending_bio_tail->bi_next = bio;
2752
2753 device->pending_bio_tail = bio;
2754 if (!device->pending_bios)
2755 device->pending_bios = bio;
2756 if (device->running_pending)
2757 should_queue = 0;
2758
2759 spin_unlock(&device->io_lock);
2760
2761 if (should_queue)
2762 btrfs_queue_worker(&root->fs_info->submit_workers,
2763 &device->work);
2764 return 0;
2765}
2766
2767int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2768 int mirror_num, int async_submit)
2769{
2770 struct btrfs_mapping_tree *map_tree;
2771 struct btrfs_device *dev;
2772 struct bio *first_bio = bio;
2773 u64 logical = (u64)bio->bi_sector << 9;
2774 u64 length = 0;
2775 u64 map_length;
2776 struct btrfs_multi_bio *multi = NULL;
2777 int ret;
2778 int dev_nr = 0;
2779 int total_devs = 1;
2780
2781 length = bio->bi_size;
2782 map_tree = &root->fs_info->mapping_tree;
2783 map_length = length;
2784
2785 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2786 mirror_num);
2787 BUG_ON(ret);
2788
2789 total_devs = multi->num_stripes;
2790 if (map_length < length) {
2791 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2792 "len %llu\n", (unsigned long long)logical,
2793 (unsigned long long)length,
2794 (unsigned long long)map_length);
2795 BUG();
2796 }
2797 multi->end_io = first_bio->bi_end_io;
2798 multi->private = first_bio->bi_private;
2799 multi->orig_bio = first_bio;
2800 atomic_set(&multi->stripes_pending, multi->num_stripes);
2801
2802 while (dev_nr < total_devs) {
2803 if (total_devs > 1) {
2804 if (dev_nr < total_devs - 1) {
2805 bio = bio_clone(first_bio, GFP_NOFS);
2806 BUG_ON(!bio);
2807 } else {
2808 bio = first_bio;
2809 }
2810 bio->bi_private = multi;
2811 bio->bi_end_io = end_bio_multi_stripe;
2812 }
2813 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2814 dev = multi->stripes[dev_nr].dev;
2815 BUG_ON(rw == WRITE && !dev->writeable);
2816 if (dev && dev->bdev) {
2817 bio->bi_bdev = dev->bdev;
2818 if (async_submit)
2819 schedule_bio(root, dev, rw, bio);
2820 else
2821 submit_bio(rw, bio);
2822 } else {
2823 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2824 bio->bi_sector = logical >> 9;
2825 bio_endio(bio, -EIO);
2826 }
2827 dev_nr++;
2828 }
2829 if (total_devs == 1)
2830 kfree(multi);
2831 return 0;
2832}
2833
2834struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2835 u8 *uuid, u8 *fsid)
2836{
2837 struct btrfs_device *device;
2838 struct btrfs_fs_devices *cur_devices;
2839
2840 cur_devices = root->fs_info->fs_devices;
2841 while (cur_devices) {
2842 if (!fsid ||
2843 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2844 device = __find_device(&cur_devices->devices,
2845 devid, uuid);
2846 if (device)
2847 return device;
2848 }
2849 cur_devices = cur_devices->seed;
2850 }
2851 return NULL;
2852}
2853
2854static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2855 u64 devid, u8 *dev_uuid)
2856{
2857 struct btrfs_device *device;
2858 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2859
2860 device = kzalloc(sizeof(*device), GFP_NOFS);
2861 if (!device)
2862 return NULL;
2863 list_add(&device->dev_list,
2864 &fs_devices->devices);
2865 device->barriers = 1;
2866 device->dev_root = root->fs_info->dev_root;
2867 device->devid = devid;
2868 device->work.func = pending_bios_fn;
2869 device->fs_devices = fs_devices;
2870 fs_devices->num_devices++;
2871 spin_lock_init(&device->io_lock);
2872 INIT_LIST_HEAD(&device->dev_alloc_list);
2873 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2874 return device;
2875}
2876
2877static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2878 struct extent_buffer *leaf,
2879 struct btrfs_chunk *chunk)
2880{
2881 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2882 struct map_lookup *map;
2883 struct extent_map *em;
2884 u64 logical;
2885 u64 length;
2886 u64 devid;
2887 u8 uuid[BTRFS_UUID_SIZE];
2888 int num_stripes;
2889 int ret;
2890 int i;
2891
2892 logical = key->offset;
2893 length = btrfs_chunk_length(leaf, chunk);
2894
2895 spin_lock(&map_tree->map_tree.lock);
2896 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2897 spin_unlock(&map_tree->map_tree.lock);
2898
2899 /* already mapped? */
2900 if (em && em->start <= logical && em->start + em->len > logical) {
2901 free_extent_map(em);
2902 return 0;
2903 } else if (em) {
2904 free_extent_map(em);
2905 }
2906
2907 map = kzalloc(sizeof(*map), GFP_NOFS);
2908 if (!map)
2909 return -ENOMEM;
2910
2911 em = alloc_extent_map(GFP_NOFS);
2912 if (!em)
2913 return -ENOMEM;
2914 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2915 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2916 if (!map) {
2917 free_extent_map(em);
2918 return -ENOMEM;
2919 }
2920
2921 em->bdev = (struct block_device *)map;
2922 em->start = logical;
2923 em->len = length;
2924 em->block_start = 0;
2925 em->block_len = em->len;
2926
2927 map->num_stripes = num_stripes;
2928 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2929 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2930 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2931 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2932 map->type = btrfs_chunk_type(leaf, chunk);
2933 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2934 for (i = 0; i < num_stripes; i++) {
2935 map->stripes[i].physical =
2936 btrfs_stripe_offset_nr(leaf, chunk, i);
2937 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2938 read_extent_buffer(leaf, uuid, (unsigned long)
2939 btrfs_stripe_dev_uuid_nr(chunk, i),
2940 BTRFS_UUID_SIZE);
2941 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2942 NULL);
2943 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2944 kfree(map);
2945 free_extent_map(em);
2946 return -EIO;
2947 }
2948 if (!map->stripes[i].dev) {
2949 map->stripes[i].dev =
2950 add_missing_dev(root, devid, uuid);
2951 if (!map->stripes[i].dev) {
2952 kfree(map);
2953 free_extent_map(em);
2954 return -EIO;
2955 }
2956 }
2957 map->stripes[i].dev->in_fs_metadata = 1;
2958 }
2959
2960 spin_lock(&map_tree->map_tree.lock);
2961 ret = add_extent_mapping(&map_tree->map_tree, em);
2962 spin_unlock(&map_tree->map_tree.lock);
2963 BUG_ON(ret);
2964 free_extent_map(em);
2965
2966 return 0;
2967}
2968
2969static int fill_device_from_item(struct extent_buffer *leaf,
2970 struct btrfs_dev_item *dev_item,
2971 struct btrfs_device *device)
2972{
2973 unsigned long ptr;
2974
2975 device->devid = btrfs_device_id(leaf, dev_item);
2976 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2977 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2978 device->type = btrfs_device_type(leaf, dev_item);
2979 device->io_align = btrfs_device_io_align(leaf, dev_item);
2980 device->io_width = btrfs_device_io_width(leaf, dev_item);
2981 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2982
2983 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2984 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2985
2986 return 0;
2987}
2988
2989static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2990{
2991 struct btrfs_fs_devices *fs_devices;
2992 int ret;
2993
2994 mutex_lock(&uuid_mutex);
2995
2996 fs_devices = root->fs_info->fs_devices->seed;
2997 while (fs_devices) {
2998 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2999 ret = 0;
3000 goto out;
3001 }
3002 fs_devices = fs_devices->seed;
3003 }
3004
3005 fs_devices = find_fsid(fsid);
3006 if (!fs_devices) {
3007 ret = -ENOENT;
3008 goto out;
3009 }
3010
3011 fs_devices = clone_fs_devices(fs_devices);
3012 if (IS_ERR(fs_devices)) {
3013 ret = PTR_ERR(fs_devices);
3014 goto out;
3015 }
3016
3017 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3018 root->fs_info->bdev_holder);
3019 if (ret)
3020 goto out;
3021
3022 if (!fs_devices->seeding) {
3023 __btrfs_close_devices(fs_devices);
3024 free_fs_devices(fs_devices);
3025 ret = -EINVAL;
3026 goto out;
3027 }
3028
3029 fs_devices->seed = root->fs_info->fs_devices->seed;
3030 root->fs_info->fs_devices->seed = fs_devices;
3031out:
3032 mutex_unlock(&uuid_mutex);
3033 return ret;
3034}
3035
3036static int read_one_dev(struct btrfs_root *root,
3037 struct extent_buffer *leaf,
3038 struct btrfs_dev_item *dev_item)
3039{
3040 struct btrfs_device *device;
3041 u64 devid;
3042 int ret;
3043 u8 fs_uuid[BTRFS_UUID_SIZE];
3044 u8 dev_uuid[BTRFS_UUID_SIZE];
3045
3046 devid = btrfs_device_id(leaf, dev_item);
3047 read_extent_buffer(leaf, dev_uuid,
3048 (unsigned long)btrfs_device_uuid(dev_item),
3049 BTRFS_UUID_SIZE);
3050 read_extent_buffer(leaf, fs_uuid,
3051 (unsigned long)btrfs_device_fsid(dev_item),
3052 BTRFS_UUID_SIZE);
3053
3054 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3055 ret = open_seed_devices(root, fs_uuid);
3056 if (ret && !btrfs_test_opt(root, DEGRADED))
3057 return ret;
3058 }
3059
3060 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3061 if (!device || !device->bdev) {
3062 if (!btrfs_test_opt(root, DEGRADED))
3063 return -EIO;
3064
3065 if (!device) {
3066 printk(KERN_WARNING "warning devid %llu missing\n",
3067 (unsigned long long)devid);
3068 device = add_missing_dev(root, devid, dev_uuid);
3069 if (!device)
3070 return -ENOMEM;
3071 }
3072 }
3073
3074 if (device->fs_devices != root->fs_info->fs_devices) {
3075 BUG_ON(device->writeable);
3076 if (device->generation !=
3077 btrfs_device_generation(leaf, dev_item))
3078 return -EINVAL;
3079 }
3080
3081 fill_device_from_item(leaf, dev_item, device);
3082 device->dev_root = root->fs_info->dev_root;
3083 device->in_fs_metadata = 1;
3084 if (device->writeable)
3085 device->fs_devices->total_rw_bytes += device->total_bytes;
3086 ret = 0;
3087 return ret;
3088}
3089
3090int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3091{
3092 struct btrfs_dev_item *dev_item;
3093
3094 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3095 dev_item);
3096 return read_one_dev(root, buf, dev_item);
3097}
3098
3099int btrfs_read_sys_array(struct btrfs_root *root)
3100{
3101 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3102 struct extent_buffer *sb;
3103 struct btrfs_disk_key *disk_key;
3104 struct btrfs_chunk *chunk;
3105 u8 *ptr;
3106 unsigned long sb_ptr;
3107 int ret = 0;
3108 u32 num_stripes;
3109 u32 array_size;
3110 u32 len = 0;
3111 u32 cur;
3112 struct btrfs_key key;
3113
3114 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3115 BTRFS_SUPER_INFO_SIZE);
3116 if (!sb)
3117 return -ENOMEM;
3118 btrfs_set_buffer_uptodate(sb);
3119 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3120 array_size = btrfs_super_sys_array_size(super_copy);
3121
3122 ptr = super_copy->sys_chunk_array;
3123 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3124 cur = 0;
3125
3126 while (cur < array_size) {
3127 disk_key = (struct btrfs_disk_key *)ptr;
3128 btrfs_disk_key_to_cpu(&key, disk_key);
3129
3130 len = sizeof(*disk_key); ptr += len;
3131 sb_ptr += len;
3132 cur += len;
3133
3134 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3135 chunk = (struct btrfs_chunk *)sb_ptr;
3136 ret = read_one_chunk(root, &key, sb, chunk);
3137 if (ret)
3138 break;
3139 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3140 len = btrfs_chunk_item_size(num_stripes);
3141 } else {
3142 ret = -EIO;
3143 break;
3144 }
3145 ptr += len;
3146 sb_ptr += len;
3147 cur += len;
3148 }
3149 free_extent_buffer(sb);
3150 return ret;
3151}
3152
3153int btrfs_read_chunk_tree(struct btrfs_root *root)
3154{
3155 struct btrfs_path *path;
3156 struct extent_buffer *leaf;
3157 struct btrfs_key key;
3158 struct btrfs_key found_key;
3159 int ret;
3160 int slot;
3161
3162 root = root->fs_info->chunk_root;
3163
3164 path = btrfs_alloc_path();
3165 if (!path)
3166 return -ENOMEM;
3167
3168 /* first we search for all of the device items, and then we
3169 * read in all of the chunk items. This way we can create chunk
3170 * mappings that reference all of the devices that are afound
3171 */
3172 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3173 key.offset = 0;
3174 key.type = 0;
3175again:
3176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177 while (1) {
3178 leaf = path->nodes[0];
3179 slot = path->slots[0];
3180 if (slot >= btrfs_header_nritems(leaf)) {
3181 ret = btrfs_next_leaf(root, path);
3182 if (ret == 0)
3183 continue;
3184 if (ret < 0)
3185 goto error;
3186 break;
3187 }
3188 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3189 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3190 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3191 break;
3192 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3193 struct btrfs_dev_item *dev_item;
3194 dev_item = btrfs_item_ptr(leaf, slot,
3195 struct btrfs_dev_item);
3196 ret = read_one_dev(root, leaf, dev_item);
3197 if (ret)
3198 goto error;
3199 }
3200 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3201 struct btrfs_chunk *chunk;
3202 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3203 ret = read_one_chunk(root, &found_key, leaf, chunk);
3204 if (ret)
3205 goto error;
3206 }
3207 path->slots[0]++;
3208 }
3209 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3210 key.objectid = 0;
3211 btrfs_release_path(root, path);
3212 goto again;
3213 }
3214 ret = 0;
3215error:
3216 btrfs_free_path(path);
3217 return ret;
3218}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
26struct btrfs_device {
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root;
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
34 u64 generation;
35
36 int barriers;
37 int writeable;
38 int in_fs_metadata;
39
40 spinlock_t io_lock;
41
42 struct block_device *bdev;
43
44 /* the mode sent to open_bdev_exclusive */
45 fmode_t mode;
46
47 char *name;
48
49 /* the internal btrfs device id */
50 u64 devid;
51
52 /* size of the device */
53 u64 total_bytes;
54
55 /* bytes used */
56 u64 bytes_used;
57
58 /* optimal io alignment for this device */
59 u32 io_align;
60
61 /* optimal io width for this device */
62 u32 io_width;
63
64 /* minimal io size for this device */
65 u32 sector_size;
66
67 /* type and info about this device */
68 u64 type;
69
70 /* physical drive uuid (or lvm uuid) */
71 u8 uuid[BTRFS_UUID_SIZE];
72
73 struct btrfs_work work;
74};
75
76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78
79 /* the device with this id has the most recent coyp of the super */
80 u64 latest_devid;
81 u64 latest_trans;
82 u64 num_devices;
83 u64 open_devices;
84 u64 rw_devices;
85 u64 total_rw_bytes;
86 struct block_device *latest_bdev;
87 /* all of the devices in the FS */
88 struct list_head devices;
89
90 /* devices not currently being allocated */
91 struct list_head alloc_list;
92 struct list_head list;
93
94 struct btrfs_fs_devices *seed;
95 int seeding;
96
97 int opened;
98};
99
100struct btrfs_bio_stripe {
101 struct btrfs_device *dev;
102 u64 physical;
103};
104
105struct btrfs_multi_bio {
106 atomic_t stripes_pending;
107 bio_end_io_t *end_io;
108 struct bio *orig_bio;
109 void *private;
110 atomic_t error;
111 int max_errors;
112 int num_stripes;
113 struct btrfs_bio_stripe stripes[];
114};
115
116#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
117 (sizeof(struct btrfs_bio_stripe) * (n)))
118
119int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
120 struct btrfs_device *device,
121 u64 chunk_tree, u64 chunk_objectid,
122 u64 chunk_offset, u64 start, u64 num_bytes);
123int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
124 u64 logical, u64 *length,
125 struct btrfs_multi_bio **multi_ret, int mirror_num);
126int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
127 u64 chunk_start, u64 physical, u64 devid,
128 u64 **logical, int *naddrs, int *stripe_len);
129int btrfs_read_sys_array(struct btrfs_root *root);
130int btrfs_read_chunk_tree(struct btrfs_root *root);
131int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
132 struct btrfs_root *extent_root, u64 type);
133void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
134void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
135int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
136 int mirror_num, int async_submit);
137int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
138int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
139 fmode_t flags, void *holder);
140int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
141 struct btrfs_fs_devices **fs_devices_ret);
142int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
143int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
144int btrfs_add_device(struct btrfs_trans_handle *trans,
145 struct btrfs_root *root,
146 struct btrfs_device *device);
147int btrfs_rm_device(struct btrfs_root *root, char *device_path);
148int btrfs_cleanup_fs_uuids(void);
149int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
150int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
151 u64 logical, struct page *page);
152int btrfs_grow_device(struct btrfs_trans_handle *trans,
153 struct btrfs_device *device, u64 new_size);
154struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
155 u8 *uuid, u8 *fsid);
156int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
157int btrfs_init_new_device(struct btrfs_root *root, char *path);
158int btrfs_balance(struct btrfs_root *dev_root);
159void btrfs_unlock_volumes(void);
160void btrfs_lock_volumes(void);
161int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
162#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first lets see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, lets remove it */
101 if (di) {
102 /* if we want create only exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 goto err;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX,
268 XATTR_SECURITY_PREFIX_LEN) ||
269 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
270 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
271 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
272}
273
274ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
275 void *buffer, size_t size)
276{
277 /*
278 * If this is a request for a synthetic attribute in the system.*
279 * namespace use the generic infrastructure to resolve a handler
280 * for it via sb->s_xattr.
281 */
282 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
283 return generic_getxattr(dentry, name, buffer, size);
284
285 if (!btrfs_is_valid_xattr(name))
286 return -EOPNOTSUPP;
287 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
288}
289
290int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
291 size_t size, int flags)
292{
293 /*
294 * If this is a request for a synthetic attribute in the system.*
295 * namespace use the generic infrastructure to resolve a handler
296 * for it via sb->s_xattr.
297 */
298 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
299 return generic_setxattr(dentry, name, value, size, flags);
300
301 if (!btrfs_is_valid_xattr(name))
302 return -EOPNOTSUPP;
303
304 if (size == 0)
305 value = ""; /* empty EA, do not remove */
306 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
307}
308
309int btrfs_removexattr(struct dentry *dentry, const char *name)
310{
311 /*
312 * If this is a request for a synthetic attribute in the system.*
313 * namespace use the generic infrastructure to resolve a handler
314 * for it via sb->s_xattr.
315 */
316 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
317 return generic_removexattr(dentry, name);
318
319 if (!btrfs_is_valid_xattr(name))
320 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33#include "compression.h"
34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace {
45 z_stream inf_strm;
46 z_stream def_strm;
47 char *buf;
48 struct list_head list;
49};
50
51static LIST_HEAD(idle_workspace);
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{
63 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148
149/*
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165}
166
167/*
168 * given an address space and start/len, compress the bytes.
169 *
170 * pages are allocated to hold the compressed result and stored
171 * in 'pages'
172 *
173 * out_pages is used to return the number of pages allocated. There
174 * may be pages allocated even if we return an error
175 *
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{
195 int ret;
196 struct workspace *workspace;
197 char *data_in;
198 char *cpage_out;
199 int nr_pages = 0;
200 struct page *in_page = NULL;
201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left;
205
206 *out_pages = 0;
207 *total_out = 0;
208 *total_in = 0;
209
210 workspace = find_zlib_workspace();
211 if (!workspace)
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1;
217 goto out;
218 }
219
220 workspace->def_strm.total_in = 0;
221 workspace->def_strm.total_out = 0;
222
223 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
224 data_in = kmap(in_page);
225
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
227 cpage_out = kmap(out_page);
228 pages[0] = out_page;
229 nr_pages = 1;
230
231 workspace->def_strm.next_in = data_in;
232 workspace->def_strm.next_out = cpage_out;
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) {
242 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
243 ret);
244 zlib_deflateEnd(&workspace->def_strm);
245 ret = -1;
246 goto out;
247 }
248
249 /* we're making it bigger, give up */
250 if (workspace->def_strm.total_in > 8192 &&
251 workspace->def_strm.total_in <
252 workspace->def_strm.total_out) {
253 ret = -1;
254 goto out;
255 }
256 /* we need another page for writing out. Test this
257 * before the total_in so we will pull in a new page for
258 * the stream end if required
259 */
260 if (workspace->def_strm.avail_out == 0) {
261 kunmap(out_page);
262 if (nr_pages == nr_dest_pages) {
263 out_page = NULL;
264 ret = -1;
265 goto out;
266 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
268 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page;
270 nr_pages++;
271 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
272 workspace->def_strm.next_out = cpage_out;
273 }
274 /* we're all done */
275 if (workspace->def_strm.total_in >= len)
276 break;
277
278 /* we've read in a full page, get a new one */
279 if (workspace->def_strm.avail_in == 0) {
280 if (workspace->def_strm.total_out > max_out)
281 break;
282
283 bytes_left = len - workspace->def_strm.total_in;
284 kunmap(in_page);
285 page_cache_release(in_page);
286
287 start += PAGE_CACHE_SIZE;
288 in_page = find_get_page(mapping,
289 start >> PAGE_CACHE_SHIFT);
290 data_in = kmap(in_page);
291 workspace->def_strm.avail_in = min(bytes_left,
292 PAGE_CACHE_SIZE);
293 workspace->def_strm.next_in = data_in;
294 }
295 }
296 workspace->def_strm.avail_in = 0;
297 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
298 zlib_deflateEnd(&workspace->def_strm);
299
300 if (ret != Z_STREAM_END) {
301 ret = -1;
302 goto out;
303 }
304
305 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
306 ret = -1;
307 goto out;
308 }
309
310 ret = 0;
311 *total_out = workspace->def_strm.total_out;
312 *total_in = workspace->def_strm.total_in;
313out:
314 *out_pages = nr_pages;
315 if (out_page)
316 kunmap(out_page);
317
318 if (in_page) {
319 kunmap(in_page);
320 page_cache_release(in_page);
321 }
322 free_workspace(workspace);
323 return ret;
324}
325
326/*
327 * pages_in is an array of pages with compressed data.
328 *
329 * disk_start is the starting logical offset of this array in the file
330 *
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{
348 int ret = 0;
349 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in;
352 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE;
359 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (!workspace)
370 return -ENOMEM;
371
372 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in;
374 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
375 workspace->inf_strm.total_in = 0;
376
377 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0;
383
384 /* If it's deflate, and it's got no preset dictionary, then
385 we can tell zlib to skip the adler32 check. */
386 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
387 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
388 !(((data_in[0]<<8) + data_in[1]) % 31)) {
389
390 wbits = -((data_in[0] >> 4) + 8);
391 workspace->inf_strm.next_in += 2;
392 workspace->inf_strm.avail_in -= 2;
393 }
394
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1;
398 goto out;
399 }
400 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END)
403 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409
410 /* total_out is the last byte of the workspace buffer */
411 total_out = workspace->inf_strm.total_out;
412
413 working_bytes = total_out - buf_start;
414
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break;
428 }
429
430 /* we haven't yet hit data corresponding to this page */
431 if (total_out <= start_byte)
432 goto next;
433
434 /*
435 * the start of the data we care about is offset into
436 * the middle of our working buffer
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 }
497next:
498 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500
501 if (workspace->inf_strm.avail_in == 0) {
502 unsigned long tmp;
503 kunmap(pages_in[page_in_index]);
504 page_in_index++;
505 if (page_in_index >= total_pages_in) {
506 data_in = NULL;
507 break;
508 }
509 data_in = kmap(pages_in[page_in_index]);
510 workspace->inf_strm.next_in = data_in;
511 tmp = srclen - workspace->inf_strm.total_in;
512 workspace->inf_strm.avail_in = min(tmp,
513 PAGE_CACHE_SIZE);
514 }
515 }
516 if (ret != Z_STREAM_END)
517 ret = -1;
518 else
519 ret = 0;
520done:
521 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in)
523 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret;
527}
528
529/*
530 * a less complex decompression routine. Our compressed data fits in a
531 * single page, and we want to read a single page out of it.
532 * start_byte tells us the offset into the compressed data we're interested in
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{
539 int ret = 0;
540 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0;
544 char *kaddr;
545
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (!workspace)
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0;
556
557 workspace->inf_strm.next_out = workspace->buf;
558 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
559 workspace->inf_strm.total_out = 0;
560 /* If it's deflate, and it's got no preset dictionary, then
561 we can tell zlib to skip the adler32 check. */
562 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
563 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
564 !(((data_in[0]<<8) + data_in[1]) % 31)) {
565
566 wbits = -((data_in[0] >> 4) + 8);
567 workspace->inf_strm.next_in += 2;
568 workspace->inf_strm.avail_in -= 2;
569 }
570
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1;
574 goto out;
575 }
576
577 while (bytes_left > 0) {
578 unsigned long buf_start;
579 unsigned long buf_offset;
580 unsigned long bytes;
581 unsigned long pg_offset = 0;
582
583 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
584 if (ret != Z_OK && ret != Z_STREAM_END)
585 break;
586
587 buf_start = total_out;
588 total_out = workspace->inf_strm.total_out;
589
590 if (total_out == buf_start) {
591 ret = -1;
592 break;
593 }
594
595 if (total_out <= start_byte)
596 goto next;
597
598 if (total_out > start_byte && buf_start < start_byte)
599 buf_offset = start_byte - buf_start;
600 else
601 buf_offset = 0;
602
603 bytes = min(PAGE_CACHE_SIZE - pg_offset,
604 PAGE_CACHE_SIZE - buf_offset);
605 bytes = min(bytes, bytes_left);
606
607 kaddr = kmap_atomic(dest_page, KM_USER0);
608 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
609 kunmap_atomic(kaddr, KM_USER0);
610
611 pg_offset += bytes;
612 bytes_left -= bytes;
613next:
614 workspace->inf_strm.next_out = workspace->buf;
615 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
616 }
617
618 if (ret != Z_STREAM_END && bytes_left != 0)
619 ret = -1;
620 else
621 ret = 0;
622
623 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret;
627}
628
629void btrfs_zlib_exit(void)
630{
631 free_workspaces();
632}
diff --git a/fs/buffer.c b/fs/buffer.c
index c26da785938a..b6e8b8632e2f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev)
203 * happen on bdev until thaw_bdev() is called. 203 * happen on bdev until thaw_bdev() is called.
204 * If a superblock is found on this device, we take the s_umount semaphore 204 * If a superblock is found on this device, we take the s_umount semaphore
205 * on it to make sure nobody unmounts until the snapshot creation is done. 205 * on it to make sure nobody unmounts until the snapshot creation is done.
206 * The reference counter (bd_fsfreeze_count) guarantees that only the last
207 * unfreeze process can unfreeze the frozen filesystem actually when multiple
208 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
209 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
210 * actually.
206 */ 211 */
207struct super_block *freeze_bdev(struct block_device *bdev) 212struct super_block *freeze_bdev(struct block_device *bdev)
208{ 213{
209 struct super_block *sb; 214 struct super_block *sb;
215 int error = 0;
216
217 mutex_lock(&bdev->bd_fsfreeze_mutex);
218 if (bdev->bd_fsfreeze_count > 0) {
219 bdev->bd_fsfreeze_count++;
220 sb = get_super(bdev);
221 mutex_unlock(&bdev->bd_fsfreeze_mutex);
222 return sb;
223 }
224 bdev->bd_fsfreeze_count++;
210 225
211 down(&bdev->bd_mount_sem); 226 down(&bdev->bd_mount_sem);
212 sb = get_super(bdev); 227 sb = get_super(bdev);
@@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev)
221 236
222 sync_blockdev(sb->s_bdev); 237 sync_blockdev(sb->s_bdev);
223 238
224 if (sb->s_op->write_super_lockfs) 239 if (sb->s_op->freeze_fs) {
225 sb->s_op->write_super_lockfs(sb); 240 error = sb->s_op->freeze_fs(sb);
241 if (error) {
242 printk(KERN_ERR
243 "VFS:Filesystem freeze failed\n");
244 sb->s_frozen = SB_UNFROZEN;
245 drop_super(sb);
246 up(&bdev->bd_mount_sem);
247 bdev->bd_fsfreeze_count--;
248 mutex_unlock(&bdev->bd_fsfreeze_mutex);
249 return ERR_PTR(error);
250 }
251 }
226 } 252 }
227 253
228 sync_blockdev(bdev); 254 sync_blockdev(bdev);
255 mutex_unlock(&bdev->bd_fsfreeze_mutex);
256
229 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 257 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
230} 258}
231EXPORT_SYMBOL(freeze_bdev); 259EXPORT_SYMBOL(freeze_bdev);
@@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev);
237 * 265 *
238 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 266 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
239 */ 267 */
240void thaw_bdev(struct block_device *bdev, struct super_block *sb) 268int thaw_bdev(struct block_device *bdev, struct super_block *sb)
241{ 269{
270 int error = 0;
271
272 mutex_lock(&bdev->bd_fsfreeze_mutex);
273 if (!bdev->bd_fsfreeze_count) {
274 mutex_unlock(&bdev->bd_fsfreeze_mutex);
275 return -EINVAL;
276 }
277
278 bdev->bd_fsfreeze_count--;
279 if (bdev->bd_fsfreeze_count > 0) {
280 if (sb)
281 drop_super(sb);
282 mutex_unlock(&bdev->bd_fsfreeze_mutex);
283 return 0;
284 }
285
242 if (sb) { 286 if (sb) {
243 BUG_ON(sb->s_bdev != bdev); 287 BUG_ON(sb->s_bdev != bdev);
244 288 if (!(sb->s_flags & MS_RDONLY)) {
245 if (sb->s_op->unlockfs) 289 if (sb->s_op->unfreeze_fs) {
246 sb->s_op->unlockfs(sb); 290 error = sb->s_op->unfreeze_fs(sb);
247 sb->s_frozen = SB_UNFROZEN; 291 if (error) {
248 smp_wmb(); 292 printk(KERN_ERR
249 wake_up(&sb->s_wait_unfrozen); 293 "VFS:Filesystem thaw failed\n");
294 sb->s_frozen = SB_FREEZE_TRANS;
295 bdev->bd_fsfreeze_count++;
296 mutex_unlock(&bdev->bd_fsfreeze_mutex);
297 return error;
298 }
299 }
300 sb->s_frozen = SB_UNFROZEN;
301 smp_wmb();
302 wake_up(&sb->s_wait_unfrozen);
303 }
250 drop_super(sb); 304 drop_super(sb);
251 } 305 }
252 306
253 up(&bdev->bd_mount_sem); 307 up(&bdev->bd_mount_sem);
308 mutex_unlock(&bdev->bd_fsfreeze_mutex);
309 return 0;
254} 310}
255EXPORT_SYMBOL(thaw_bdev); 311EXPORT_SYMBOL(thaw_bdev);
256 312
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
11 11
12#include "coda_int.h" 12#include "coda_int.h"
13 13
14#ifdef CONFIG_SYSCTL
14static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
15 17
16static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
17 { 19 {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
41 {} 43 {}
42}; 44};
43 45
46#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 47static ctl_table fs_table[] = {
45 { 48 {
46 .ctl_name = CTL_UNNUMBERED, 49 .ctl_name = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
50 }, 53 },
51 {} 54 {}
52}; 55};
53 56#endif
54 57
55void coda_sysctl_init(void) 58void coda_sysctl_init(void)
56{ 59{
diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a32..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
1567 spin_unlock(&dcache_lock); 1567 spin_unlock(&dcache_lock);
1568} 1568}
1569 1569
1570#define do_switch(x,y) do { \
1571 __typeof__ (x) __tmp = x; \
1572 x = y; y = __tmp; } while (0)
1573
1574/* 1570/*
1575 * When switching names, the actual string doesn't strictly have to 1571 * When switching names, the actual string doesn't strictly have to
1576 * be preserved in the target - because we're dropping the target 1572 * be preserved in the target - because we're dropping the target
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1589 /* 1585 /*
1590 * Both external: swap the pointers 1586 * Both external: swap the pointers
1591 */ 1587 */
1592 do_switch(target->d_name.name, dentry->d_name.name); 1588 swap(target->d_name.name, dentry->d_name.name);
1593 } else { 1589 } else {
1594 /* 1590 /*
1595 * dentry:internal, target:external. Steal target's 1591 * dentry:internal, target:external. Steal target's
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 return; 1616 return;
1621 } 1617 }
1622 } 1618 }
1623 do_switch(dentry->d_name.len, target->d_name.len); 1619 swap(dentry->d_name.len, target->d_name.len);
1624} 1620}
1625 1621
1626/* 1622/*
@@ -1680,7 +1676,7 @@ already_unhashed:
1680 1676
1681 /* Switch the names.. */ 1677 /* Switch the names.. */
1682 switch_names(dentry, target); 1678 switch_names(dentry, target);
1683 do_switch(dentry->d_name.hash, target->d_name.hash); 1679 swap(dentry->d_name.hash, target->d_name.hash);
1684 1680
1685 /* ... and switch the parents */ 1681 /* ... and switch the parents */
1686 if (IS_ROOT(dentry)) { 1682 if (IS_ROOT(dentry)) {
@@ -1688,7 +1684,7 @@ already_unhashed:
1688 target->d_parent = target; 1684 target->d_parent = target;
1689 INIT_LIST_HEAD(&target->d_u.d_child); 1685 INIT_LIST_HEAD(&target->d_u.d_child);
1690 } else { 1686 } else {
1691 do_switch(dentry->d_parent, target->d_parent); 1687 swap(dentry->d_parent, target->d_parent);
1692 1688
1693 /* And add them back to the (new) parent lists */ 1689 /* And add them back to the (new) parent lists */
1694 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 1690 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1789 struct dentry *dparent, *aparent; 1785 struct dentry *dparent, *aparent;
1790 1786
1791 switch_names(dentry, anon); 1787 switch_names(dentry, anon);
1792 do_switch(dentry->d_name.hash, anon->d_name.hash); 1788 swap(dentry->d_name.hash, anon->d_name.hash);
1793 1789
1794 dparent = dentry->d_parent; 1790 dparent = dentry->d_parent;
1795 aparent = anon->d_parent; 1791 aparent = anon->d_parent;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 2f107d1a6a45..1d1d27442235 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -25,19 +25,6 @@ static struct mutex debug_buf_lock;
25 25
26static struct dentry *dlm_root; 26static struct dentry *dlm_root;
27 27
28struct rsb_iter {
29 int entry;
30 int format;
31 int header;
32 struct dlm_ls *ls;
33 struct list_head *next;
34 struct dlm_rsb *rsb;
35};
36
37/*
38 * dump all rsb's in the lockspace hash table
39 */
40
41static char *print_lockmode(int mode) 28static char *print_lockmode(int mode)
42{ 29{
43 switch (mode) { 30 switch (mode) {
@@ -60,13 +47,13 @@ static char *print_lockmode(int mode)
60 } 47 }
61} 48}
62 49
63static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, 50static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
64 struct dlm_rsb *res) 51 struct dlm_rsb *res)
65{ 52{
66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 53 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
67 54
68 if (lkb->lkb_status == DLM_LKSTS_CONVERT 55 if (lkb->lkb_status == DLM_LKSTS_CONVERT ||
69 || lkb->lkb_status == DLM_LKSTS_WAITING) 56 lkb->lkb_status == DLM_LKSTS_WAITING)
70 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); 57 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
71 58
72 if (lkb->lkb_nodeid) { 59 if (lkb->lkb_nodeid) {
@@ -80,33 +67,42 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
80 if (lkb->lkb_wait_type) 67 if (lkb->lkb_wait_type)
81 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); 68 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
82 69
83 seq_printf(s, "\n"); 70 return seq_printf(s, "\n");
84} 71}
85 72
86static int print_format1(struct dlm_rsb *res, struct seq_file *s) 73static int print_format1(struct dlm_rsb *res, struct seq_file *s)
87{ 74{
88 struct dlm_lkb *lkb; 75 struct dlm_lkb *lkb;
89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; 76 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
77 int rv;
90 78
91 lock_rsb(res); 79 lock_rsb(res);
92 80
93 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); 81 rv = seq_printf(s, "\nResource %p Name (len=%d) \"",
82 res, res->res_length);
83 if (rv)
84 goto out;
85
94 for (i = 0; i < res->res_length; i++) { 86 for (i = 0; i < res->res_length; i++) {
95 if (isprint(res->res_name[i])) 87 if (isprint(res->res_name[i]))
96 seq_printf(s, "%c", res->res_name[i]); 88 seq_printf(s, "%c", res->res_name[i]);
97 else 89 else
98 seq_printf(s, "%c", '.'); 90 seq_printf(s, "%c", '.');
99 } 91 }
92
100 if (res->res_nodeid > 0) 93 if (res->res_nodeid > 0)
101 seq_printf(s, "\" \nLocal Copy, Master is node %d\n", 94 rv = seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
102 res->res_nodeid); 95 res->res_nodeid);
103 else if (res->res_nodeid == 0) 96 else if (res->res_nodeid == 0)
104 seq_printf(s, "\" \nMaster Copy\n"); 97 rv = seq_printf(s, "\" \nMaster Copy\n");
105 else if (res->res_nodeid == -1) 98 else if (res->res_nodeid == -1)
106 seq_printf(s, "\" \nLooking up master (lkid %x)\n", 99 rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n",
107 res->res_first_lkid); 100 res->res_first_lkid);
108 else 101 else
109 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid); 102 rv = seq_printf(s, "\" \nInvalid master %d\n",
103 res->res_nodeid);
104 if (rv)
105 goto out;
110 106
111 /* Print the LVB: */ 107 /* Print the LVB: */
112 if (res->res_lvbptr) { 108 if (res->res_lvbptr) {
@@ -119,52 +115,66 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
119 } 115 }
120 if (rsb_flag(res, RSB_VALNOTVALID)) 116 if (rsb_flag(res, RSB_VALNOTVALID))
121 seq_printf(s, " (INVALID)"); 117 seq_printf(s, " (INVALID)");
122 seq_printf(s, "\n"); 118 rv = seq_printf(s, "\n");
119 if (rv)
120 goto out;
123 } 121 }
124 122
125 root_list = !list_empty(&res->res_root_list); 123 root_list = !list_empty(&res->res_root_list);
126 recover_list = !list_empty(&res->res_recover_list); 124 recover_list = !list_empty(&res->res_recover_list);
127 125
128 if (root_list || recover_list) { 126 if (root_list || recover_list) {
129 seq_printf(s, "Recovery: root %d recover %d flags %lx " 127 rv = seq_printf(s, "Recovery: root %d recover %d flags %lx "
130 "count %d\n", root_list, recover_list, 128 "count %d\n", root_list, recover_list,
131 res->res_flags, res->res_recover_locks_count); 129 res->res_flags, res->res_recover_locks_count);
130 if (rv)
131 goto out;
132 } 132 }
133 133
134 /* Print the locks attached to this resource */ 134 /* Print the locks attached to this resource */
135 seq_printf(s, "Granted Queue\n"); 135 seq_printf(s, "Granted Queue\n");
136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
137 print_format1_lock(s, lkb, res); 137 rv = print_format1_lock(s, lkb, res);
138 if (rv)
139 goto out;
140 }
138 141
139 seq_printf(s, "Conversion Queue\n"); 142 seq_printf(s, "Conversion Queue\n");
140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 143 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
141 print_format1_lock(s, lkb, res); 144 rv = print_format1_lock(s, lkb, res);
145 if (rv)
146 goto out;
147 }
142 148
143 seq_printf(s, "Waiting Queue\n"); 149 seq_printf(s, "Waiting Queue\n");
144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 150 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
145 print_format1_lock(s, lkb, res); 151 rv = print_format1_lock(s, lkb, res);
152 if (rv)
153 goto out;
154 }
146 155
147 if (list_empty(&res->res_lookup)) 156 if (list_empty(&res->res_lookup))
148 goto out; 157 goto out;
149 158
150 seq_printf(s, "Lookup Queue\n"); 159 seq_printf(s, "Lookup Queue\n");
151 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { 160 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
152 seq_printf(s, "%08x %s", lkb->lkb_id, 161 rv = seq_printf(s, "%08x %s", lkb->lkb_id,
153 print_lockmode(lkb->lkb_rqmode)); 162 print_lockmode(lkb->lkb_rqmode));
154 if (lkb->lkb_wait_type) 163 if (lkb->lkb_wait_type)
155 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); 164 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
156 seq_printf(s, "\n"); 165 rv = seq_printf(s, "\n");
157 } 166 }
158 out: 167 out:
159 unlock_rsb(res); 168 unlock_rsb(res);
160 return 0; 169 return rv;
161} 170}
162 171
163static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, 172static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
164 struct dlm_rsb *r) 173 struct dlm_rsb *r)
165{ 174{
166 u64 xid = 0; 175 u64 xid = 0;
167 u64 us; 176 u64 us;
177 int rv;
168 178
169 if (lkb->lkb_flags & DLM_IFL_USER) { 179 if (lkb->lkb_flags & DLM_IFL_USER) {
170 if (lkb->lkb_ua) 180 if (lkb->lkb_ua)
@@ -177,69 +187,82 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
177 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us 187 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
178 r_nodeid r_len r_name */ 188 r_nodeid r_len r_name */
179 189
180 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", 190 rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
181 lkb->lkb_id, 191 lkb->lkb_id,
182 lkb->lkb_nodeid, 192 lkb->lkb_nodeid,
183 lkb->lkb_remid, 193 lkb->lkb_remid,
184 lkb->lkb_ownpid, 194 lkb->lkb_ownpid,
185 (unsigned long long)xid, 195 (unsigned long long)xid,
186 lkb->lkb_exflags, 196 lkb->lkb_exflags,
187 lkb->lkb_flags, 197 lkb->lkb_flags,
188 lkb->lkb_status, 198 lkb->lkb_status,
189 lkb->lkb_grmode, 199 lkb->lkb_grmode,
190 lkb->lkb_rqmode, 200 lkb->lkb_rqmode,
191 (unsigned long long)us, 201 (unsigned long long)us,
192 r->res_nodeid, 202 r->res_nodeid,
193 r->res_length, 203 r->res_length,
194 r->res_name); 204 r->res_name);
205 return rv;
195} 206}
196 207
197static int print_format2(struct dlm_rsb *r, struct seq_file *s) 208static int print_format2(struct dlm_rsb *r, struct seq_file *s)
198{ 209{
199 struct dlm_lkb *lkb; 210 struct dlm_lkb *lkb;
211 int rv = 0;
200 212
201 lock_rsb(r); 213 lock_rsb(r);
202 214
203 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) 215 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
204 print_format2_lock(s, lkb, r); 216 rv = print_format2_lock(s, lkb, r);
205 217 if (rv)
206 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) 218 goto out;
207 print_format2_lock(s, lkb, r); 219 }
208 220
209 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) 221 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
210 print_format2_lock(s, lkb, r); 222 rv = print_format2_lock(s, lkb, r);
223 if (rv)
224 goto out;
225 }
211 226
227 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
228 rv = print_format2_lock(s, lkb, r);
229 if (rv)
230 goto out;
231 }
232 out:
212 unlock_rsb(r); 233 unlock_rsb(r);
213 return 0; 234 return rv;
214} 235}
215 236
216static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, 237static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
217 int rsb_lookup) 238 int rsb_lookup)
218{ 239{
219 u64 xid = 0; 240 u64 xid = 0;
241 int rv;
220 242
221 if (lkb->lkb_flags & DLM_IFL_USER) { 243 if (lkb->lkb_flags & DLM_IFL_USER) {
222 if (lkb->lkb_ua) 244 if (lkb->lkb_ua)
223 xid = lkb->lkb_ua->xid; 245 xid = lkb->lkb_ua->xid;
224 } 246 }
225 247
226 seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", 248 rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
227 lkb->lkb_id, 249 lkb->lkb_id,
228 lkb->lkb_nodeid, 250 lkb->lkb_nodeid,
229 lkb->lkb_remid, 251 lkb->lkb_remid,
230 lkb->lkb_ownpid, 252 lkb->lkb_ownpid,
231 (unsigned long long)xid, 253 (unsigned long long)xid,
232 lkb->lkb_exflags, 254 lkb->lkb_exflags,
233 lkb->lkb_flags, 255 lkb->lkb_flags,
234 lkb->lkb_status, 256 lkb->lkb_status,
235 lkb->lkb_grmode, 257 lkb->lkb_grmode,
236 lkb->lkb_rqmode, 258 lkb->lkb_rqmode,
237 lkb->lkb_highbast, 259 lkb->lkb_highbast,
238 rsb_lookup, 260 rsb_lookup,
239 lkb->lkb_wait_type, 261 lkb->lkb_wait_type,
240 lkb->lkb_lvbseq, 262 lkb->lkb_lvbseq,
241 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), 263 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
242 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); 264 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
265 return rv;
243} 266}
244 267
245static int print_format3(struct dlm_rsb *r, struct seq_file *s) 268static int print_format3(struct dlm_rsb *r, struct seq_file *s)
@@ -247,18 +270,21 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
247 struct dlm_lkb *lkb; 270 struct dlm_lkb *lkb;
248 int i, lvblen = r->res_ls->ls_lvblen; 271 int i, lvblen = r->res_ls->ls_lvblen;
249 int print_name = 1; 272 int print_name = 1;
273 int rv;
250 274
251 lock_rsb(r); 275 lock_rsb(r);
252 276
253 seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", 277 rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
254 r, 278 r,
255 r->res_nodeid, 279 r->res_nodeid,
256 r->res_first_lkid, 280 r->res_first_lkid,
257 r->res_flags, 281 r->res_flags,
258 !list_empty(&r->res_root_list), 282 !list_empty(&r->res_root_list),
259 !list_empty(&r->res_recover_list), 283 !list_empty(&r->res_recover_list),
260 r->res_recover_locks_count, 284 r->res_recover_locks_count,
261 r->res_length); 285 r->res_length);
286 if (rv)
287 goto out;
262 288
263 for (i = 0; i < r->res_length; i++) { 289 for (i = 0; i < r->res_length; i++) {
264 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) 290 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
@@ -273,7 +299,9 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
273 else 299 else
274 seq_printf(s, " %02x", (unsigned char)r->res_name[i]); 300 seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
275 } 301 }
276 seq_printf(s, "\n"); 302 rv = seq_printf(s, "\n");
303 if (rv)
304 goto out;
277 305
278 if (!r->res_lvbptr) 306 if (!r->res_lvbptr)
279 goto do_locks; 307 goto do_locks;
@@ -282,344 +310,294 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
282 310
283 for (i = 0; i < lvblen; i++) 311 for (i = 0; i < lvblen; i++)
284 seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); 312 seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
285 seq_printf(s, "\n"); 313 rv = seq_printf(s, "\n");
314 if (rv)
315 goto out;
286 316
287 do_locks: 317 do_locks:
288 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) 318 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
289 print_format3_lock(s, lkb, 0); 319 rv = print_format3_lock(s, lkb, 0);
290 320 if (rv)
291 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) 321 goto out;
292 print_format3_lock(s, lkb, 0);
293
294 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
295 print_format3_lock(s, lkb, 0);
296
297 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
298 print_format3_lock(s, lkb, 1);
299
300 unlock_rsb(r);
301 return 0;
302}
303
304static int rsb_iter_next(struct rsb_iter *ri)
305{
306 struct dlm_ls *ls = ri->ls;
307 int i;
308
309 if (!ri->next) {
310 top:
311 /* Find the next non-empty hash bucket */
312 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
313 read_lock(&ls->ls_rsbtbl[i].lock);
314 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
315 ri->next = ls->ls_rsbtbl[i].list.next;
316 ri->rsb = list_entry(ri->next, struct dlm_rsb,
317 res_hashchain);
318 dlm_hold_rsb(ri->rsb);
319 read_unlock(&ls->ls_rsbtbl[i].lock);
320 break;
321 }
322 read_unlock(&ls->ls_rsbtbl[i].lock);
323 }
324 ri->entry = i;
325
326 if (ri->entry >= ls->ls_rsbtbl_size)
327 return 1;
328 } else {
329 struct dlm_rsb *old = ri->rsb;
330 i = ri->entry;
331 read_lock(&ls->ls_rsbtbl[i].lock);
332 ri->next = ri->next->next;
333 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
334 /* End of list - move to next bucket */
335 ri->next = NULL;
336 ri->entry++;
337 read_unlock(&ls->ls_rsbtbl[i].lock);
338 dlm_put_rsb(old);
339 goto top;
340 }
341 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
342 dlm_hold_rsb(ri->rsb);
343 read_unlock(&ls->ls_rsbtbl[i].lock);
344 dlm_put_rsb(old);
345 } 322 }
346 323
347 return 0; 324 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
348} 325 rv = print_format3_lock(s, lkb, 0);
349 326 if (rv)
350static void rsb_iter_free(struct rsb_iter *ri) 327 goto out;
351{
352 kfree(ri);
353}
354
355static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
356{
357 struct rsb_iter *ri;
358
359 ri = kzalloc(sizeof *ri, GFP_KERNEL);
360 if (!ri)
361 return NULL;
362
363 ri->ls = ls;
364 ri->entry = 0;
365 ri->next = NULL;
366 ri->format = 1;
367
368 if (rsb_iter_next(ri)) {
369 rsb_iter_free(ri);
370 return NULL;
371 } 328 }
372 329
373 return ri; 330 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
374} 331 rv = print_format3_lock(s, lkb, 0);
375 332 if (rv)
376static void *rsb_seq_start(struct seq_file *file, loff_t *pos) 333 goto out;
377{
378 struct rsb_iter *ri;
379 loff_t n = *pos;
380
381 ri = rsb_iter_init(file->private);
382 if (!ri)
383 return NULL;
384
385 while (n--) {
386 if (rsb_iter_next(ri)) {
387 rsb_iter_free(ri);
388 return NULL;
389 }
390 } 334 }
391 335
392 return ri; 336 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
393} 337 rv = print_format3_lock(s, lkb, 1);
394 338 if (rv)
395static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos) 339 goto out;
396{
397 struct rsb_iter *ri = iter_ptr;
398
399 (*pos)++;
400
401 if (rsb_iter_next(ri)) {
402 rsb_iter_free(ri);
403 return NULL;
404 } 340 }
405 341 out:
406 return ri; 342 unlock_rsb(r);
343 return rv;
407} 344}
408 345
409static void rsb_seq_stop(struct seq_file *file, void *iter_ptr) 346struct rsbtbl_iter {
410{ 347 struct dlm_rsb *rsb;
411 /* nothing for now */ 348 unsigned bucket;
412} 349 int format;
350 int header;
351};
413 352
414static int rsb_seq_show(struct seq_file *file, void *iter_ptr) 353/* seq_printf returns -1 if the buffer is full, and 0 otherwise.
354 If the buffer is full, seq_printf can be called again, but it
355 does nothing and just returns -1. So, the these printing routines
356 periodically check the return value to avoid wasting too much time
357 trying to print to a full buffer. */
358
359static int table_seq_show(struct seq_file *seq, void *iter_ptr)
415{ 360{
416 struct rsb_iter *ri = iter_ptr; 361 struct rsbtbl_iter *ri = iter_ptr;
362 int rv = 0;
417 363
418 switch (ri->format) { 364 switch (ri->format) {
419 case 1: 365 case 1:
420 print_format1(ri->rsb, file); 366 rv = print_format1(ri->rsb, seq);
421 break; 367 break;
422 case 2: 368 case 2:
423 if (ri->header) { 369 if (ri->header) {
424 seq_printf(file, "id nodeid remid pid xid exflags " 370 seq_printf(seq, "id nodeid remid pid xid exflags "
425 "flags sts grmode rqmode time_ms " 371 "flags sts grmode rqmode time_ms "
426 "r_nodeid r_len r_name\n"); 372 "r_nodeid r_len r_name\n");
427 ri->header = 0; 373 ri->header = 0;
428 } 374 }
429 print_format2(ri->rsb, file); 375 rv = print_format2(ri->rsb, seq);
430 break; 376 break;
431 case 3: 377 case 3:
432 if (ri->header) { 378 if (ri->header) {
433 seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); 379 seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
434 ri->header = 0; 380 ri->header = 0;
435 } 381 }
436 print_format3(ri->rsb, file); 382 rv = print_format3(ri->rsb, seq);
437 break; 383 break;
438 } 384 }
439 385
440 return 0; 386 return rv;
441} 387}
442 388
443static struct seq_operations rsb_seq_ops = { 389static struct seq_operations format1_seq_ops;
444 .start = rsb_seq_start, 390static struct seq_operations format2_seq_ops;
445 .next = rsb_seq_next, 391static struct seq_operations format3_seq_ops;
446 .stop = rsb_seq_stop,
447 .show = rsb_seq_show,
448};
449 392
450static int rsb_open(struct inode *inode, struct file *file) 393static void *table_seq_start(struct seq_file *seq, loff_t *pos)
451{ 394{
452 struct seq_file *seq; 395 struct dlm_ls *ls = seq->private;
453 int ret; 396 struct rsbtbl_iter *ri;
454 397 struct dlm_rsb *r;
455 ret = seq_open(file, &rsb_seq_ops); 398 loff_t n = *pos;
456 if (ret) 399 unsigned bucket, entry;
457 return ret;
458
459 seq = file->private_data;
460 seq->private = inode->i_private;
461
462 return 0;
463}
464
465static const struct file_operations rsb_fops = {
466 .owner = THIS_MODULE,
467 .open = rsb_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = seq_release
471};
472 400
473/* 401 bucket = n >> 32;
474 * Dump state in compact per-lock listing 402 entry = n & ((1LL << 32) - 1);
475 */
476 403
477static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos) 404 if (bucket >= ls->ls_rsbtbl_size)
478{ 405 return NULL;
479 struct rsb_iter *ri;
480 406
481 ri = kzalloc(sizeof *ri, GFP_KERNEL); 407 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
482 if (!ri) 408 if (!ri)
483 return NULL; 409 return NULL;
484 410 if (n == 0)
485 ri->ls = ls;
486 ri->entry = 0;
487 ri->next = NULL;
488 ri->format = 2;
489
490 if (*pos == 0)
491 ri->header = 1; 411 ri->header = 1;
492 412 if (seq->op == &format1_seq_ops)
493 if (rsb_iter_next(ri)) { 413 ri->format = 1;
494 rsb_iter_free(ri); 414 if (seq->op == &format2_seq_ops)
495 return NULL; 415 ri->format = 2;
416 if (seq->op == &format3_seq_ops)
417 ri->format = 3;
418
419 spin_lock(&ls->ls_rsbtbl[bucket].lock);
420 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
421 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
422 res_hashchain) {
423 if (!entry--) {
424 dlm_hold_rsb(r);
425 ri->rsb = r;
426 ri->bucket = bucket;
427 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
428 return ri;
429 }
430 }
496 } 431 }
432 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
497 433
498 return ri; 434 /*
499} 435 * move to the first rsb in the next non-empty bucket
436 */
500 437
501static void *locks_seq_start(struct seq_file *file, loff_t *pos) 438 /* zero the entry */
502{ 439 n &= ~((1LL << 32) - 1);
503 struct rsb_iter *ri;
504 loff_t n = *pos;
505 440
506 ri = locks_iter_init(file->private, pos); 441 while (1) {
507 if (!ri) 442 bucket++;
508 return NULL; 443 n += 1LL << 32;
509 444
510 while (n--) { 445 if (bucket >= ls->ls_rsbtbl_size) {
511 if (rsb_iter_next(ri)) { 446 kfree(ri);
512 rsb_iter_free(ri);
513 return NULL; 447 return NULL;
514 } 448 }
515 }
516 449
517 return ri; 450 spin_lock(&ls->ls_rsbtbl[bucket].lock);
451 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
452 r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
453 struct dlm_rsb, res_hashchain);
454 dlm_hold_rsb(r);
455 ri->rsb = r;
456 ri->bucket = bucket;
457 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
458 *pos = n;
459 return ri;
460 }
461 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
462 }
518} 463}
519 464
520static struct seq_operations locks_seq_ops = { 465static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
521 .start = locks_seq_start,
522 .next = rsb_seq_next,
523 .stop = rsb_seq_stop,
524 .show = rsb_seq_show,
525};
526
527static int locks_open(struct inode *inode, struct file *file)
528{ 466{
529 struct seq_file *seq; 467 struct dlm_ls *ls = seq->private;
530 int ret; 468 struct rsbtbl_iter *ri = iter_ptr;
531 469 struct list_head *next;
532 ret = seq_open(file, &locks_seq_ops); 470 struct dlm_rsb *r, *rp;
533 if (ret) 471 loff_t n = *pos;
534 return ret; 472 unsigned bucket;
535 473
536 seq = file->private_data; 474 bucket = n >> 32;
537 seq->private = inode->i_private; 475
538 476 /*
539 return 0; 477 * move to the next rsb in the same bucket
540} 478 */
541 479
542static const struct file_operations locks_fops = { 480 spin_lock(&ls->ls_rsbtbl[bucket].lock);
543 .owner = THIS_MODULE, 481 rp = ri->rsb;
544 .open = locks_open, 482 next = rp->res_hashchain.next;
545 .read = seq_read, 483
546 .llseek = seq_lseek, 484 if (next != &ls->ls_rsbtbl[bucket].list) {
547 .release = seq_release 485 r = list_entry(next, struct dlm_rsb, res_hashchain);
548}; 486 dlm_hold_rsb(r);
549 487 ri->rsb = r;
550/* 488 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
551 * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks 489 dlm_put_rsb(rp);
552 * This can replace both formats 1 and 2 eventually. 490 ++*pos;
553 */ 491 return ri;
492 }
493 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
494 dlm_put_rsb(rp);
554 495
555static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos) 496 /*
556{ 497 * move to the first rsb in the next non-empty bucket
557 struct rsb_iter *ri; 498 */
558 499
559 ri = kzalloc(sizeof *ri, GFP_KERNEL); 500 /* zero the entry */
560 if (!ri) 501 n &= ~((1LL << 32) - 1);
561 return NULL;
562 502
563 ri->ls = ls; 503 while (1) {
564 ri->entry = 0; 504 bucket++;
565 ri->next = NULL; 505 n += 1LL << 32;
566 ri->format = 3;
567 506
568 if (*pos == 0) 507 if (bucket >= ls->ls_rsbtbl_size) {
569 ri->header = 1; 508 kfree(ri);
509 return NULL;
510 }
570 511
571 if (rsb_iter_next(ri)) { 512 spin_lock(&ls->ls_rsbtbl[bucket].lock);
572 rsb_iter_free(ri); 513 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
573 return NULL; 514 r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
515 struct dlm_rsb, res_hashchain);
516 dlm_hold_rsb(r);
517 ri->rsb = r;
518 ri->bucket = bucket;
519 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
520 *pos = n;
521 return ri;
522 }
523 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
574 } 524 }
575
576 return ri;
577} 525}
578 526
579static void *all_seq_start(struct seq_file *file, loff_t *pos) 527static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
580{ 528{
581 struct rsb_iter *ri; 529 struct rsbtbl_iter *ri = iter_ptr;
582 loff_t n = *pos;
583
584 ri = all_iter_init(file->private, pos);
585 if (!ri)
586 return NULL;
587 530
588 while (n--) { 531 if (ri) {
589 if (rsb_iter_next(ri)) { 532 dlm_put_rsb(ri->rsb);
590 rsb_iter_free(ri); 533 kfree(ri);
591 return NULL;
592 }
593 } 534 }
594
595 return ri;
596} 535}
597 536
598static struct seq_operations all_seq_ops = { 537static struct seq_operations format1_seq_ops = {
599 .start = all_seq_start, 538 .start = table_seq_start,
600 .next = rsb_seq_next, 539 .next = table_seq_next,
601 .stop = rsb_seq_stop, 540 .stop = table_seq_stop,
602 .show = rsb_seq_show, 541 .show = table_seq_show,
603}; 542};
604 543
605static int all_open(struct inode *inode, struct file *file) 544static struct seq_operations format2_seq_ops = {
545 .start = table_seq_start,
546 .next = table_seq_next,
547 .stop = table_seq_stop,
548 .show = table_seq_show,
549};
550
551static struct seq_operations format3_seq_ops = {
552 .start = table_seq_start,
553 .next = table_seq_next,
554 .stop = table_seq_stop,
555 .show = table_seq_show,
556};
557
558static const struct file_operations format1_fops;
559static const struct file_operations format2_fops;
560static const struct file_operations format3_fops;
561
562static int table_open(struct inode *inode, struct file *file)
606{ 563{
607 struct seq_file *seq; 564 struct seq_file *seq;
608 int ret; 565 int ret = -1;
566
567 if (file->f_op == &format1_fops)
568 ret = seq_open(file, &format1_seq_ops);
569 else if (file->f_op == &format2_fops)
570 ret = seq_open(file, &format2_seq_ops);
571 else if (file->f_op == &format3_fops)
572 ret = seq_open(file, &format3_seq_ops);
609 573
610 ret = seq_open(file, &all_seq_ops);
611 if (ret) 574 if (ret)
612 return ret; 575 return ret;
613 576
614 seq = file->private_data; 577 seq = file->private_data;
615 seq->private = inode->i_private; 578 seq->private = inode->i_private; /* the dlm_ls */
616
617 return 0; 579 return 0;
618} 580}
619 581
620static const struct file_operations all_fops = { 582static const struct file_operations format1_fops = {
583 .owner = THIS_MODULE,
584 .open = table_open,
585 .read = seq_read,
586 .llseek = seq_lseek,
587 .release = seq_release
588};
589
590static const struct file_operations format2_fops = {
591 .owner = THIS_MODULE,
592 .open = table_open,
593 .read = seq_read,
594 .llseek = seq_lseek,
595 .release = seq_release
596};
597
598static const struct file_operations format3_fops = {
621 .owner = THIS_MODULE, 599 .owner = THIS_MODULE,
622 .open = all_open, 600 .open = table_open,
623 .read = seq_read, 601 .read = seq_read,
624 .llseek = seq_lseek, 602 .llseek = seq_lseek,
625 .release = seq_release 603 .release = seq_release
@@ -689,7 +667,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
689 S_IFREG | S_IRUGO, 667 S_IFREG | S_IRUGO,
690 dlm_root, 668 dlm_root,
691 ls, 669 ls,
692 &rsb_fops); 670 &format1_fops);
693 if (!ls->ls_debug_rsb_dentry) 671 if (!ls->ls_debug_rsb_dentry)
694 goto fail; 672 goto fail;
695 673
@@ -702,7 +680,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
702 S_IFREG | S_IRUGO, 680 S_IFREG | S_IRUGO,
703 dlm_root, 681 dlm_root,
704 ls, 682 ls,
705 &locks_fops); 683 &format2_fops);
706 if (!ls->ls_debug_locks_dentry) 684 if (!ls->ls_debug_locks_dentry)
707 goto fail; 685 goto fail;
708 686
@@ -715,7 +693,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
715 S_IFREG | S_IRUGO, 693 S_IFREG | S_IRUGO,
716 dlm_root, 694 dlm_root,
717 ls, 695 ls,
718 &all_fops); 696 &format3_fops);
719 if (!ls->ls_debug_all_dentry) 697 if (!ls->ls_debug_all_dentry)
720 goto fail; 698 goto fail;
721 699
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index ef2f1e353966..076e86f38bc8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -105,7 +105,7 @@ struct dlm_dirtable {
105struct dlm_rsbtable { 105struct dlm_rsbtable {
106 struct list_head list; 106 struct list_head list;
107 struct list_head toss; 107 struct list_head toss;
108 rwlock_t lock; 108 spinlock_t lock;
109}; 109};
110 110
111struct dlm_lkbtable { 111struct dlm_lkbtable {
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6cfe65bbf4a2..01e7d39c5fba 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -412,9 +412,9 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
412 unsigned int flags, struct dlm_rsb **r_ret) 412 unsigned int flags, struct dlm_rsb **r_ret)
413{ 413{
414 int error; 414 int error;
415 write_lock(&ls->ls_rsbtbl[b].lock); 415 spin_lock(&ls->ls_rsbtbl[b].lock);
416 error = _search_rsb(ls, name, len, b, flags, r_ret); 416 error = _search_rsb(ls, name, len, b, flags, r_ret);
417 write_unlock(&ls->ls_rsbtbl[b].lock); 417 spin_unlock(&ls->ls_rsbtbl[b].lock);
418 return error; 418 return error;
419} 419}
420 420
@@ -478,16 +478,16 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
478 r->res_nodeid = nodeid; 478 r->res_nodeid = nodeid;
479 } 479 }
480 480
481 write_lock(&ls->ls_rsbtbl[bucket].lock); 481 spin_lock(&ls->ls_rsbtbl[bucket].lock);
482 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); 482 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
483 if (!error) { 483 if (!error) {
484 write_unlock(&ls->ls_rsbtbl[bucket].lock); 484 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
485 dlm_free_rsb(r); 485 dlm_free_rsb(r);
486 r = tmp; 486 r = tmp;
487 goto out; 487 goto out;
488 } 488 }
489 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); 489 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
490 write_unlock(&ls->ls_rsbtbl[bucket].lock); 490 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
491 error = 0; 491 error = 0;
492 out: 492 out:
493 *r_ret = r; 493 *r_ret = r;
@@ -530,9 +530,9 @@ static void put_rsb(struct dlm_rsb *r)
530 struct dlm_ls *ls = r->res_ls; 530 struct dlm_ls *ls = r->res_ls;
531 uint32_t bucket = r->res_bucket; 531 uint32_t bucket = r->res_bucket;
532 532
533 write_lock(&ls->ls_rsbtbl[bucket].lock); 533 spin_lock(&ls->ls_rsbtbl[bucket].lock);
534 kref_put(&r->res_ref, toss_rsb); 534 kref_put(&r->res_ref, toss_rsb);
535 write_unlock(&ls->ls_rsbtbl[bucket].lock); 535 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
536} 536}
537 537
538void dlm_put_rsb(struct dlm_rsb *r) 538void dlm_put_rsb(struct dlm_rsb *r)
@@ -967,7 +967,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
967 967
968 for (;;) { 968 for (;;) {
969 found = 0; 969 found = 0;
970 write_lock(&ls->ls_rsbtbl[b].lock); 970 spin_lock(&ls->ls_rsbtbl[b].lock);
971 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, 971 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
972 res_hashchain) { 972 res_hashchain) {
973 if (!time_after_eq(jiffies, r->res_toss_time + 973 if (!time_after_eq(jiffies, r->res_toss_time +
@@ -978,20 +978,20 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
978 } 978 }
979 979
980 if (!found) { 980 if (!found) {
981 write_unlock(&ls->ls_rsbtbl[b].lock); 981 spin_unlock(&ls->ls_rsbtbl[b].lock);
982 break; 982 break;
983 } 983 }
984 984
985 if (kref_put(&r->res_ref, kill_rsb)) { 985 if (kref_put(&r->res_ref, kill_rsb)) {
986 list_del(&r->res_hashchain); 986 list_del(&r->res_hashchain);
987 write_unlock(&ls->ls_rsbtbl[b].lock); 987 spin_unlock(&ls->ls_rsbtbl[b].lock);
988 988
989 if (is_master(r)) 989 if (is_master(r))
990 dir_remove(r); 990 dir_remove(r);
991 dlm_free_rsb(r); 991 dlm_free_rsb(r);
992 count++; 992 count++;
993 } else { 993 } else {
994 write_unlock(&ls->ls_rsbtbl[b].lock); 994 spin_unlock(&ls->ls_rsbtbl[b].lock);
995 log_error(ls, "tossed rsb in use %s", r->res_name); 995 log_error(ls, "tossed rsb in use %s", r->res_name);
996 } 996 }
997 } 997 }
@@ -4224,7 +4224,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4224{ 4224{
4225 struct dlm_rsb *r, *r_ret = NULL; 4225 struct dlm_rsb *r, *r_ret = NULL;
4226 4226
4227 read_lock(&ls->ls_rsbtbl[bucket].lock); 4227 spin_lock(&ls->ls_rsbtbl[bucket].lock);
4228 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { 4228 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
4229 if (!rsb_flag(r, RSB_LOCKS_PURGED)) 4229 if (!rsb_flag(r, RSB_LOCKS_PURGED))
4230 continue; 4230 continue;
@@ -4233,7 +4233,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4233 r_ret = r; 4233 r_ret = r;
4234 break; 4234 break;
4235 } 4235 }
4236 read_unlock(&ls->ls_rsbtbl[bucket].lock); 4236 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
4237 return r_ret; 4237 return r_ret;
4238} 4238}
4239 4239
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8d86b7960f0d..aa32e5f02493 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -464,7 +464,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
464 for (i = 0; i < size; i++) { 464 for (i = 0; i < size; i++) {
465 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); 465 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
466 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); 466 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
467 rwlock_init(&ls->ls_rsbtbl[i].lock); 467 spin_lock_init(&ls->ls_rsbtbl[i].lock);
468 } 468 }
469 469
470 size = dlm_config.ci_lkbtbl_size; 470 size = dlm_config.ci_lkbtbl_size;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 80aba5bdd4a4..eda43f362616 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -726,7 +726,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
726 } 726 }
727 727
728 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 728 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
729 read_lock(&ls->ls_rsbtbl[i].lock); 729 spin_lock(&ls->ls_rsbtbl[i].lock);
730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { 730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
731 list_add(&r->res_root_list, &ls->ls_root_list); 731 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 732 dlm_hold_rsb(r);
@@ -737,7 +737,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
737 but no other recovery steps should do anything with them. */ 737 but no other recovery steps should do anything with them. */
738 738
739 if (dlm_no_directory(ls)) { 739 if (dlm_no_directory(ls)) {
740 read_unlock(&ls->ls_rsbtbl[i].lock); 740 spin_unlock(&ls->ls_rsbtbl[i].lock);
741 continue; 741 continue;
742 } 742 }
743 743
@@ -745,7 +745,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
745 list_add(&r->res_root_list, &ls->ls_root_list); 745 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r); 746 dlm_hold_rsb(r);
747 } 747 }
748 read_unlock(&ls->ls_rsbtbl[i].lock); 748 spin_unlock(&ls->ls_rsbtbl[i].lock);
749 } 749 }
750 out: 750 out:
751 up_write(&ls->ls_root_sem); 751 up_write(&ls->ls_root_sem);
@@ -775,7 +775,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
775 int i; 775 int i;
776 776
777 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 777 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
778 write_lock(&ls->ls_rsbtbl[i].lock); 778 spin_lock(&ls->ls_rsbtbl[i].lock);
779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
780 res_hashchain) { 780 res_hashchain) {
781 if (dlm_no_directory(ls) || !is_master(r)) { 781 if (dlm_no_directory(ls) || !is_master(r)) {
@@ -783,7 +783,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
783 dlm_free_rsb(r); 783 dlm_free_rsb(r);
784 } 784 }
785 } 785 }
786 write_unlock(&ls->ls_rsbtbl[i].lock); 786 spin_unlock(&ls->ls_rsbtbl[i].lock);
787 } 787 }
788} 788}
789 789
diff --git a/fs/dquot.c b/fs/dquot.c
index 61bfff64e5af..48c0571f831d 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -2090,10 +2090,12 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2090 } 2090 }
2091 if (di->dqb_valid & QIF_BTIME) { 2091 if (di->dqb_valid & QIF_BTIME) {
2092 dm->dqb_btime = di->dqb_btime; 2092 dm->dqb_btime = di->dqb_btime;
2093 check_blim = 1;
2093 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2094 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2094 } 2095 }
2095 if (di->dqb_valid & QIF_ITIME) { 2096 if (di->dqb_valid & QIF_ITIME) {
2096 dm->dqb_itime = di->dqb_itime; 2097 dm->dqb_itime = di->dqb_itime;
2098 check_ilim = 1;
2097 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2099 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2098 } 2100 }
2099 2101
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c454d5db28a5..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
565 inode->i_blocks = 0; 565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567 memset(ei->i_data, 0, sizeof(ei->i_data)); 567 memset(ei->i_data, 0, sizeof(ei->i_data));
568 ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; 568 ei->i_flags =
569 if (S_ISLNK(mode)) 569 ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
570 ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
571 /* dirsync is only applied to directories */
572 if (!S_ISDIR(mode))
573 ei->i_flags &= ~EXT2_DIRSYNC_FL;
574 ei->i_faddr = 0; 570 ei->i_faddr = 0;
575 ei->i_frag_no = 0; 571 ei->i_frag_no = 0;
576 ei->i_frag_size = 0; 572 ei->i_frag_size = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 02b39a5deb74..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
498 * ext2_splice_branch - splice the allocated branch onto inode. 498 * ext2_splice_branch - splice the allocated branch onto inode.
499 * @inode: owner 499 * @inode: owner
500 * @block: (logical) number of block we are adding 500 * @block: (logical) number of block we are adding
501 * @chain: chain of indirect blocks (with a missing link - see
502 * ext2_alloc_branch)
503 * @where: location of missing link 501 * @where: location of missing link
504 * @num: number of indirect blocks we are adding 502 * @num: number of indirect blocks we are adding
505 * @blks: number of direct blocks we are adding 503 * @blks: number of direct blocks we are adding
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
50 goto setflags_out; 50 goto setflags_out;
51 } 51 }
52 52
53 if (!S_ISDIR(inode->i_mode)) 53 flags = ext2_mask_flags(inode->i_mode, flags);
54 flags &= ~EXT2_DIRSYNC_FL;
55 54
56 mutex_lock(&inode->i_mutex); 55 mutex_lock(&inode->i_mutex);
57 /* Is it quota file? Do not allow user to mess with it */ 56 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
132 percpu_counter_destroy(&sbi->s_dirs_counter); 132 percpu_counter_destroy(&sbi->s_dirs_counter);
133 brelse (sbi->s_sbh); 133 brelse (sbi->s_sbh);
134 sb->s_fs_info = NULL; 134 sb->s_fs_info = NULL;
135 kfree(sbi->s_blockgroup_lock);
135 kfree(sbi); 136 kfree(sbi);
136 137
137 return; 138 return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
756 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 757 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
757 if (!sbi) 758 if (!sbi)
758 return -ENOMEM; 759 return -ENOMEM;
760
761 sbi->s_blockgroup_lock =
762 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
763 if (!sbi->s_blockgroup_lock) {
764 kfree(sbi);
765 return -ENOMEM;
766 }
759 sb->s_fs_info = sbi; 767 sb->s_fs_info = sbi;
760 sbi->s_sb_block = sb_block; 768 sbi->s_sb_block = sb_block;
761 769
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
983 printk ("EXT2-fs: not enough memory\n"); 991 printk ("EXT2-fs: not enough memory\n");
984 goto failed_mount; 992 goto failed_mount;
985 } 993 }
986 bgl_lock_init(&sbi->s_blockgroup_lock); 994 bgl_lock_init(sbi->s_blockgroup_lock);
987 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 995 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
988 if (!sbi->s_debts) { 996 if (!sbi->s_debts) {
989 printk ("EXT2-fs: not enough memory\n"); 997 printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
49} 68}
50 69
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
52{ 71{
53 __u32 pad, val; 72 __u32 pad, val;
54 int i; 73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
96}
97
98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
99{
100 __u32 pad, val;
101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i=0; i < len; i++) { 110 for (i=0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5655fbcbd11f..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
559 ei->i_dir_start_lookup = 0; 559 ei->i_dir_start_lookup = 0;
560 ei->i_disksize = 0; 560 ei->i_disksize = 0;
561 561
562 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; 562 ei->i_flags =
563 if (S_ISLNK(mode)) 563 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
564 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
565 /* dirsync only applies to directories */
566 if (!S_ISDIR(mode))
567 ei->i_flags &= ~EXT3_DIRSYNC_FL;
568#ifdef EXT3_FRAGMENTS 564#ifdef EXT3_FRAGMENTS
569 ei->i_faddr = 0; 565 ei->i_faddr = 0;
570 ei->i_frag_no = 0; 566 ei->i_frag_no = 0;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
53 goto flags_out; 53 goto flags_out;
54 } 54 }
55 55
56 if (!S_ISDIR(inode->i_mode)) 56 flags = ext3_mask_flags(inode->i_mode, flags);
57 flags &= ~EXT3_DIRSYNC_FL;
58 57
59 mutex_lock(&inode->i_mutex); 58 mutex_lock(&inode->i_mutex);
60 /* Is it quota file? Do not allow user to mess with it */ 59 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1dd2abe6313e..69a3d19ca9fd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
368 goto fail; 364 goto fail;
369 } 365 }
370 hinfo->hash_version = root->info.hash_version; 366 hinfo->hash_version = root->info.hash_version;
367 if (hinfo->hash_version <= DX_HASH_TEA)
368 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
371 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; 369 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
372 if (entry) 370 if (entry)
373 ext3fs_dirhash(entry->name, entry->len, hinfo); 371 ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
636 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
637 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { 635 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
638 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
637 if (hinfo.hash_version <= DX_HASH_TEA)
638 hinfo.hash_version +=
639 EXT3_SB(dir->i_sb)->s_hash_unsigned;
639 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 640 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
640 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 641 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
641 start_hash, start_minor_hash); 642 start_hash, start_minor_hash);
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1156 u32 hash2; 1157 u32 hash2;
1157 struct dx_map_entry *map; 1158 struct dx_map_entry *map;
1158 char *data1 = (*bh)->b_data, *data2; 1159 char *data1 = (*bh)->b_data, *data2;
1159 unsigned split, move, size, i; 1160 unsigned split, move, size;
1160 struct ext3_dir_entry_2 *de = NULL, *de2; 1161 struct ext3_dir_entry_2 *de = NULL, *de2;
1161 int err = 0; 1162 int err = 0, i;
1162 1163
1163 bh2 = ext3_append (handle, dir, &newblock, &err); 1164 bh2 = ext3_append (handle, dir, &newblock, &err);
1164 if (!(bh2)) { 1165 if (!(bh2)) {
@@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1398 1399
1399 /* Initialize as for dx_probe */ 1400 /* Initialize as for dx_probe */
1400 hinfo.hash_version = root->info.hash_version; 1401 hinfo.hash_version = root->info.hash_version;
1402 if (hinfo.hash_version <= DX_HASH_TEA)
1403 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1401 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 1404 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402 ext3fs_dirhash(name, namelen, &hinfo); 1405 ext3fs_dirhash(name, namelen, &hinfo);
1403 frame = frames; 1406 frame = frames;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c22d01467bd1..b70d90e08a3c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum); 48 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 49static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
50 unsigned int); 50 unsigned int);
51static void ext3_commit_super (struct super_block * sb, 51static int ext3_commit_super(struct super_block *sb,
52 struct ext3_super_block * es, 52 struct ext3_super_block *es,
53 int sync); 53 int sync);
54static void ext3_mark_recovery_complete(struct super_block * sb, 54static void ext3_mark_recovery_complete(struct super_block * sb,
55 struct ext3_super_block * es); 55 struct ext3_super_block * es);
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
60 char nbuf[16]); 60 char nbuf[16]);
61static int ext3_remount (struct super_block * sb, int * flags, char * data); 61static int ext3_remount (struct super_block * sb, int * flags, char * data);
62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); 62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
63static void ext3_unlockfs(struct super_block *sb); 63static int ext3_unfreeze(struct super_block *sb);
64static void ext3_write_super (struct super_block * sb); 64static void ext3_write_super (struct super_block * sb);
65static void ext3_write_super_lockfs(struct super_block *sb); 65static int ext3_freeze(struct super_block *sb);
66 66
67/* 67/*
68 * Wrappers for journal_start/end. 68 * Wrappers for journal_start/end.
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
439 ext3_blkdev_remove(sbi); 439 ext3_blkdev_remove(sbi);
440 } 440 }
441 sb->s_fs_info = NULL; 441 sb->s_fs_info = NULL;
442 kfree(sbi->s_blockgroup_lock);
442 kfree(sbi); 443 kfree(sbi);
443 return; 444 return;
444} 445}
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
682 ext3_nfs_get_inode); 683 ext3_nfs_get_inode);
683} 684}
684 685
686/*
687 * Try to release metadata pages (indirect blocks, directories) which are
688 * mapped via the block device. Since these pages could have journal heads
689 * which would prevent try_to_free_buffers() from freeing them, we must use
690 * jbd layer's try_to_free_buffers() function to release them.
691 */
692static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
693 gfp_t wait)
694{
695 journal_t *journal = EXT3_SB(sb)->s_journal;
696
697 WARN_ON(PageChecked(page));
698 if (!page_has_buffers(page))
699 return 0;
700 if (journal)
701 return journal_try_to_free_buffers(journal, page,
702 wait & ~__GFP_WAIT);
703 return try_to_free_buffers(page);
704}
705
685#ifdef CONFIG_QUOTA 706#ifdef CONFIG_QUOTA
686#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 707#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
687#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 708#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -738,8 +759,8 @@ static const struct super_operations ext3_sops = {
738 .put_super = ext3_put_super, 759 .put_super = ext3_put_super,
739 .write_super = ext3_write_super, 760 .write_super = ext3_write_super,
740 .sync_fs = ext3_sync_fs, 761 .sync_fs = ext3_sync_fs,
741 .write_super_lockfs = ext3_write_super_lockfs, 762 .freeze_fs = ext3_freeze,
742 .unlockfs = ext3_unlockfs, 763 .unfreeze_fs = ext3_unfreeze,
743 .statfs = ext3_statfs, 764 .statfs = ext3_statfs,
744 .remount_fs = ext3_remount, 765 .remount_fs = ext3_remount,
745 .clear_inode = ext3_clear_inode, 766 .clear_inode = ext3_clear_inode,
@@ -748,6 +769,7 @@ static const struct super_operations ext3_sops = {
748 .quota_read = ext3_quota_read, 769 .quota_read = ext3_quota_read,
749 .quota_write = ext3_quota_write, 770 .quota_write = ext3_quota_write,
750#endif 771#endif
772 .bdev_try_to_free_page = bdev_try_to_free_page,
751}; 773};
752 774
753static const struct export_operations ext3_export_ops = { 775static const struct export_operations ext3_export_ops = {
@@ -1546,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1546 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 1568 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1547 if (!sbi) 1569 if (!sbi)
1548 return -ENOMEM; 1570 return -ENOMEM;
1571
1572 sbi->s_blockgroup_lock =
1573 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1574 if (!sbi->s_blockgroup_lock) {
1575 kfree(sbi);
1576 return -ENOMEM;
1577 }
1549 sb->s_fs_info = sbi; 1578 sb->s_fs_info = sbi;
1550 sbi->s_mount_opt = 0; 1579 sbi->s_mount_opt = 0;
1551 sbi->s_resuid = EXT3_DEF_RESUID; 1580 sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1742,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1742 for (i=0; i < 4; i++) 1771 for (i=0; i < 4; i++)
1743 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 1772 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1744 sbi->s_def_hash_version = es->s_def_hash_version; 1773 sbi->s_def_hash_version = es->s_def_hash_version;
1774 i = le32_to_cpu(es->s_flags);
1775 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1776 sbi->s_hash_unsigned = 3;
1777 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1778#ifdef __CHAR_UNSIGNED__
1779 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1780 sbi->s_hash_unsigned = 3;
1781#else
1782 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1783#endif
1784 sb->s_dirt = 1;
1785 }
1745 1786
1746 if (sbi->s_blocks_per_group > blocksize * 8) { 1787 if (sbi->s_blocks_per_group > blocksize * 8) {
1747 printk (KERN_ERR 1788 printk (KERN_ERR
@@ -1786,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1786 goto failed_mount; 1827 goto failed_mount;
1787 } 1828 }
1788 1829
1789 bgl_lock_init(&sbi->s_blockgroup_lock); 1830 bgl_lock_init(sbi->s_blockgroup_lock);
1790 1831
1791 for (i = 0; i < db_count; i++) { 1832 for (i = 0; i < db_count; i++) {
1792 block = descriptor_loc(sb, logic_sb_block, i); 1833 block = descriptor_loc(sb, logic_sb_block, i);
@@ -2270,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb,
2270 return 0; 2311 return 0;
2271} 2312}
2272 2313
2273static void ext3_commit_super (struct super_block * sb, 2314static int ext3_commit_super(struct super_block *sb,
2274 struct ext3_super_block * es, 2315 struct ext3_super_block *es,
2275 int sync) 2316 int sync)
2276{ 2317{
2277 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; 2318 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2319 int error = 0;
2278 2320
2279 if (!sbh) 2321 if (!sbh)
2280 return; 2322 return error;
2281 es->s_wtime = cpu_to_le32(get_seconds()); 2323 es->s_wtime = cpu_to_le32(get_seconds());
2282 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); 2324 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2283 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2325 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2284 BUFFER_TRACE(sbh, "marking dirty"); 2326 BUFFER_TRACE(sbh, "marking dirty");
2285 mark_buffer_dirty(sbh); 2327 mark_buffer_dirty(sbh);
2286 if (sync) 2328 if (sync)
2287 sync_dirty_buffer(sbh); 2329 error = sync_dirty_buffer(sbh);
2330 return error;
2288} 2331}
2289 2332
2290 2333
@@ -2398,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
2398 * LVM calls this function before a (read-only) snapshot is created. This 2441 * LVM calls this function before a (read-only) snapshot is created. This
2399 * gives us a chance to flush the journal completely and mark the fs clean. 2442 * gives us a chance to flush the journal completely and mark the fs clean.
2400 */ 2443 */
2401static void ext3_write_super_lockfs(struct super_block *sb) 2444static int ext3_freeze(struct super_block *sb)
2402{ 2445{
2446 int error = 0;
2447 journal_t *journal;
2403 sb->s_dirt = 0; 2448 sb->s_dirt = 0;
2404 2449
2405 if (!(sb->s_flags & MS_RDONLY)) { 2450 if (!(sb->s_flags & MS_RDONLY)) {
2406 journal_t *journal = EXT3_SB(sb)->s_journal; 2451 journal = EXT3_SB(sb)->s_journal;
2407 2452
2408 /* Now we set up the journal barrier. */ 2453 /* Now we set up the journal barrier. */
2409 journal_lock_updates(journal); 2454 journal_lock_updates(journal);
@@ -2412,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb)
2412 * We don't want to clear needs_recovery flag when we failed 2457 * We don't want to clear needs_recovery flag when we failed
2413 * to flush the journal. 2458 * to flush the journal.
2414 */ 2459 */
2415 if (journal_flush(journal) < 0) 2460 error = journal_flush(journal);
2416 return; 2461 if (error < 0)
2462 goto out;
2417 2463
2418 /* Journal blocked and flushed, clear needs_recovery flag. */ 2464 /* Journal blocked and flushed, clear needs_recovery flag. */
2419 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2465 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2420 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2466 error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2467 if (error)
2468 goto out;
2421 } 2469 }
2470 return 0;
2471
2472out:
2473 journal_unlock_updates(journal);
2474 return error;
2422} 2475}
2423 2476
2424/* 2477/*
2425 * Called by LVM after the snapshot is done. We need to reset the RECOVER 2478 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2426 * flag here, even though the filesystem is not technically dirty yet. 2479 * flag here, even though the filesystem is not technically dirty yet.
2427 */ 2480 */
2428static void ext3_unlockfs(struct super_block *sb) 2481static int ext3_unfreeze(struct super_block *sb)
2429{ 2482{
2430 if (!(sb->s_flags & MS_RDONLY)) { 2483 if (!(sb->s_flags & MS_RDONLY)) {
2431 lock_super(sb); 2484 lock_super(sb);
@@ -2435,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb)
2435 unlock_super(sb); 2488 unlock_super(sb);
2436 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2489 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2437 } 2490 }
2491 return 0;
2438} 2492}
2439 2493
2440static int ext3_remount (struct super_block * sb, int * flags, char * data) 2494static int ext3_remount (struct super_block * sb, int * flags, char * data)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..6bba06b09dd1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "group.h" 22#include "group.h"
23#include "mballoc.h"
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
100 * essentially implementing a per-group read-only flag. */ 101 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 102 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, __func__, 103 ext4_error(sb, __func__,
103 "Checksum bad for group %lu\n", block_group); 104 "Checksum bad for group %u", block_group);
104 gdp->bg_free_blocks_count = 0; 105 ext4_free_blks_set(sb, gdp, 0);
105 gdp->bg_free_inodes_count = 0; 106 ext4_free_inodes_set(sb, gdp, 0);
106 gdp->bg_itable_unused = 0; 107 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize); 108 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0; 109 return 0;
109 } 110 }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205 ext4_group_t block_group, 206 ext4_group_t block_group,
206 struct buffer_head **bh) 207 struct buffer_head **bh)
207{ 208{
208 unsigned long group_desc; 209 unsigned int group_desc;
209 unsigned long offset; 210 unsigned int offset;
210 struct ext4_group_desc *desc; 211 struct ext4_group_desc *desc;
211 struct ext4_sb_info *sbi = EXT4_SB(sb); 212 struct ext4_sb_info *sbi = EXT4_SB(sb);
212 213
213 if (block_group >= sbi->s_groups_count) { 214 if (block_group >= sbi->s_groups_count) {
214 ext4_error(sb, "ext4_get_group_desc", 215 ext4_error(sb, "ext4_get_group_desc",
215 "block_group >= groups_count - " 216 "block_group >= groups_count - "
216 "block_group = %lu, groups_count = %lu", 217 "block_group = %u, groups_count = %u",
217 block_group, sbi->s_groups_count); 218 block_group, sbi->s_groups_count);
218 219
219 return NULL; 220 return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
225 if (!sbi->s_group_desc[group_desc]) { 226 if (!sbi->s_group_desc[group_desc]) {
226 ext4_error(sb, "ext4_get_group_desc", 227 ext4_error(sb, "ext4_get_group_desc",
227 "Group descriptor not loaded - " 228 "Group descriptor not loaded - "
228 "block_group = %lu, group_desc = %lu, desc = %lu", 229 "block_group = %u, group_desc = %u, desc = %u",
229 block_group, group_desc, offset); 230 block_group, group_desc, offset);
230 return NULL; 231 return NULL;
231 } 232 }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
315 if (unlikely(!bh)) { 316 if (unlikely(!bh)) {
316 ext4_error(sb, __func__, 317 ext4_error(sb, __func__,
317 "Cannot read block bitmap - " 318 "Cannot read block bitmap - "
318 "block_group = %lu, block_bitmap = %llu", 319 "block_group = %u, block_bitmap = %llu",
319 block_group, bitmap_blk); 320 block_group, bitmap_blk);
320 return NULL; 321 return NULL;
321 } 322 }
322 if (buffer_uptodate(bh) && 323
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) 324 if (bitmap_uptodate(bh))
324 return bh; 325 return bh;
325 326
326 lock_buffer(bh); 327 lock_buffer(bh);
328 if (bitmap_uptodate(bh)) {
329 unlock_buffer(bh);
330 return bh;
331 }
327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 332 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 333 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
329 ext4_init_block_bitmap(sb, bh, block_group, desc); 334 ext4_init_block_bitmap(sb, bh, block_group, desc);
335 set_bitmap_uptodate(bh);
330 set_buffer_uptodate(bh); 336 set_buffer_uptodate(bh);
331 unlock_buffer(bh);
332 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 337 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
338 unlock_buffer(bh);
333 return bh; 339 return bh;
334 } 340 }
335 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 341 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
342 if (buffer_uptodate(bh)) {
343 /*
344 * if not uninit if bh is uptodate,
345 * bitmap is also uptodate
346 */
347 set_bitmap_uptodate(bh);
348 unlock_buffer(bh);
349 return bh;
350 }
351 /*
352 * submit the buffer_head for read. We can
353 * safely mark the bitmap as uptodate now.
354 * We do it here so the bitmap uptodate bit
355 * get set with buffer lock held.
356 */
357 set_bitmap_uptodate(bh);
336 if (bh_submit_read(bh) < 0) { 358 if (bh_submit_read(bh) < 0) {
337 put_bh(bh); 359 put_bh(bh);
338 ext4_error(sb, __func__, 360 ext4_error(sb, __func__,
339 "Cannot read block bitmap - " 361 "Cannot read block bitmap - "
340 "block_group = %lu, block_bitmap = %llu", 362 "block_group = %u, block_bitmap = %llu",
341 block_group, bitmap_blk); 363 block_group, bitmap_blk);
342 return NULL; 364 return NULL;
343 } 365 }
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
350} 372}
351 373
352/** 374/**
353 * ext4_free_blocks_sb() -- Free given blocks and update quota 375 * ext4_add_groupblocks() -- Add given blocks to an existing group
354 * @handle: handle to this transaction 376 * @handle: handle to this transaction
355 * @sb: super block 377 * @sb: super block
356 * @block: start physcial block to free 378 * @block: start physcial block to add to the block group
357 * @count: number of blocks to free 379 * @count: number of blocks to free
358 * @pdquot_freed_blocks: pointer to quota
359 * 380 *
360 * XXX This function is only used by the on-line resizing code, which 381 * This marks the blocks as free in the bitmap. We ask the
361 * should probably be fixed up to call the mballoc variant. There 382 * mballoc to reload the buddy after this by setting group
362 * this needs to be cleaned up later; in fact, I'm not convinced this 383 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
366 */ 384 */
367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 385void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
368 ext4_fsblk_t block, unsigned long count, 386 ext4_fsblk_t block, unsigned long count)
369 unsigned long *pdquot_freed_blocks)
370{ 387{
371 struct buffer_head *bitmap_bh = NULL; 388 struct buffer_head *bitmap_bh = NULL;
372 struct buffer_head *gd_bh; 389 struct buffer_head *gd_bh;
373 ext4_group_t block_group; 390 ext4_group_t block_group;
374 ext4_grpblk_t bit; 391 ext4_grpblk_t bit;
375 unsigned long i; 392 unsigned int i;
376 unsigned long overflow;
377 struct ext4_group_desc *desc; 393 struct ext4_group_desc *desc;
378 struct ext4_super_block *es; 394 struct ext4_super_block *es;
379 struct ext4_sb_info *sbi; 395 struct ext4_sb_info *sbi;
380 int err = 0, ret; 396 int err = 0, ret, blk_free_count;
381 ext4_grpblk_t group_freed; 397 ext4_grpblk_t blocks_freed;
398 struct ext4_group_info *grp;
382 399
383 *pdquot_freed_blocks = 0;
384 sbi = EXT4_SB(sb); 400 sbi = EXT4_SB(sb);
385 es = sbi->s_es; 401 es = sbi->s_es;
386 if (block < le32_to_cpu(es->s_first_data_block) || 402 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
387 block + count < block ||
388 block + count > ext4_blocks_count(es)) {
389 ext4_error(sb, "ext4_free_blocks",
390 "Freeing blocks not in datazone - "
391 "block = %llu, count = %lu", block, count);
392 goto error_return;
393 }
394
395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
396 403
397do_more:
398 overflow = 0;
399 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 404 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
405 grp = ext4_get_group_info(sb, block_group);
400 /* 406 /*
401 * Check to see if we are freeing blocks across a group 407 * Check to see if we are freeing blocks across a group
402 * boundary. 408 * boundary.
403 */ 409 */
404 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 410 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
405 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 411 goto error_return;
406 count -= overflow;
407 } 412 }
408 brelse(bitmap_bh);
409 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 413 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
410 if (!bitmap_bh) 414 if (!bitmap_bh)
411 goto error_return; 415 goto error_return;
@@ -418,18 +422,17 @@ do_more:
418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 422 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
419 in_range(block + count - 1, ext4_inode_table(sb, desc), 423 in_range(block + count - 1, ext4_inode_table(sb, desc),
420 sbi->s_itb_per_group)) { 424 sbi->s_itb_per_group)) {
421 ext4_error(sb, "ext4_free_blocks", 425 ext4_error(sb, __func__,
422 "Freeing blocks in system zones - " 426 "Adding blocks in system zones - "
423 "Block = %llu, count = %lu", 427 "Block = %llu, count = %lu",
424 block, count); 428 block, count);
425 goto error_return; 429 goto error_return;
426 } 430 }
427 431
428 /* 432 /*
429 * We are about to start releasing blocks in the bitmap, 433 * We are about to add blocks to the bitmap,
430 * so we need undo access. 434 * so we need undo access.
431 */ 435 */
432 /* @@@ check errors */
433 BUFFER_TRACE(bitmap_bh, "getting undo access"); 436 BUFFER_TRACE(bitmap_bh, "getting undo access");
434 err = ext4_journal_get_undo_access(handle, bitmap_bh); 437 err = ext4_journal_get_undo_access(handle, bitmap_bh);
435 if (err) 438 if (err)
@@ -444,107 +447,55 @@ do_more:
444 err = ext4_journal_get_write_access(handle, gd_bh); 447 err = ext4_journal_get_write_access(handle, gd_bh);
445 if (err) 448 if (err)
446 goto error_return; 449 goto error_return;
447 450 /*
448 jbd_lock_bh_state(bitmap_bh); 451 * make sure we don't allow a parallel init on other groups in the
449 452 * same buddy cache
450 for (i = 0, group_freed = 0; i < count; i++) { 453 */
451 /* 454 down_write(&grp->alloc_sem);
452 * An HJ special. This is expensive... 455 for (i = 0, blocks_freed = 0; i < count; i++) {
453 */
454#ifdef CONFIG_JBD2_DEBUG
455 jbd_unlock_bh_state(bitmap_bh);
456 {
457 struct buffer_head *debug_bh;
458 debug_bh = sb_find_get_block(sb, block + i);
459 if (debug_bh) {
460 BUFFER_TRACE(debug_bh, "Deleted!");
461 if (!bh2jh(bitmap_bh)->b_committed_data)
462 BUFFER_TRACE(debug_bh,
463 "No commited data in bitmap");
464 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
465 __brelse(debug_bh);
466 }
467 }
468 jbd_lock_bh_state(bitmap_bh);
469#endif
470 if (need_resched()) {
471 jbd_unlock_bh_state(bitmap_bh);
472 cond_resched();
473 jbd_lock_bh_state(bitmap_bh);
474 }
475 /* @@@ This prevents newly-allocated data from being
476 * freed and then reallocated within the same
477 * transaction.
478 *
479 * Ideally we would want to allow that to happen, but to
480 * do so requires making jbd2_journal_forget() capable of
481 * revoking the queued write of a data block, which
482 * implies blocking on the journal lock. *forget()
483 * cannot block due to truncate races.
484 *
485 * Eventually we can fix this by making jbd2_journal_forget()
486 * return a status indicating whether or not it was able
487 * to revoke the buffer. On successful revoke, it is
488 * safe not to set the allocation bit in the committed
489 * bitmap, because we know that there is no outstanding
490 * activity on the buffer any more and so it is safe to
491 * reallocate it.
492 */
493 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
494 J_ASSERT_BH(bitmap_bh,
495 bh2jh(bitmap_bh)->b_committed_data != NULL);
496 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
497 bh2jh(bitmap_bh)->b_committed_data);
498
499 /*
500 * We clear the bit in the bitmap after setting the committed
501 * data bit, because this is the reverse order to that which
502 * the allocator uses.
503 */
504 BUFFER_TRACE(bitmap_bh, "clear bit"); 456 BUFFER_TRACE(bitmap_bh, "clear bit");
505 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 457 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
506 bit + i, bitmap_bh->b_data)) { 458 bit + i, bitmap_bh->b_data)) {
507 jbd_unlock_bh_state(bitmap_bh);
508 ext4_error(sb, __func__, 459 ext4_error(sb, __func__,
509 "bit already cleared for block %llu", 460 "bit already cleared for block %llu",
510 (ext4_fsblk_t)(block + i)); 461 (ext4_fsblk_t)(block + i));
511 jbd_lock_bh_state(bitmap_bh);
512 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 462 BUFFER_TRACE(bitmap_bh, "bit already cleared");
513 } else { 463 } else {
514 group_freed++; 464 blocks_freed++;
515 } 465 }
516 } 466 }
517 jbd_unlock_bh_state(bitmap_bh);
518
519 spin_lock(sb_bgl_lock(sbi, block_group)); 467 spin_lock(sb_bgl_lock(sbi, block_group));
520 le16_add_cpu(&desc->bg_free_blocks_count, group_freed); 468 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
469 ext4_free_blks_set(sb, desc, blk_free_count);
521 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 470 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
522 spin_unlock(sb_bgl_lock(sbi, block_group)); 471 spin_unlock(sb_bgl_lock(sbi, block_group));
523 percpu_counter_add(&sbi->s_freeblocks_counter, count); 472 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
524 473
525 if (sbi->s_log_groups_per_flex) { 474 if (sbi->s_log_groups_per_flex) {
526 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
527 spin_lock(sb_bgl_lock(sbi, flex_group)); 476 spin_lock(sb_bgl_lock(sbi, flex_group));
528 sbi->s_flex_groups[flex_group].free_blocks += count; 477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
529 spin_unlock(sb_bgl_lock(sbi, flex_group)); 478 spin_unlock(sb_bgl_lock(sbi, flex_group));
530 } 479 }
480 /*
481 * request to reload the buddy with the
482 * new bitmap information
483 */
484 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
485 ext4_mb_update_group_info(grp, blocks_freed);
486 up_write(&grp->alloc_sem);
531 487
532 /* We dirtied the bitmap block */ 488 /* We dirtied the bitmap block */
533 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 489 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
534 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 490 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
535 491
536 /* And the group descriptor block */ 492 /* And the group descriptor block */
537 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 493 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
538 ret = ext4_journal_dirty_metadata(handle, gd_bh); 494 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
539 if (!err) err = ret; 495 if (!err)
540 *pdquot_freed_blocks += group_freed; 496 err = ret;
541
542 if (overflow && !err) {
543 block += count;
544 count = overflow;
545 goto do_more;
546 }
547 sb->s_dirt = 1; 497 sb->s_dirt = 1;
498
548error_return: 499error_return:
549 brelse(bitmap_bh); 500 brelse(bitmap_bh);
550 ext4_std_error(sb, err); 501 ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
614 if (dirty_blocks < 0) { 565 if (dirty_blocks < 0) {
615 printk(KERN_CRIT "Dirty block accounting " 566 printk(KERN_CRIT "Dirty block accounting "
616 "went wrong %lld\n", 567 "went wrong %lld\n",
617 dirty_blocks); 568 (long long)dirty_blocks);
618 } 569 }
619 } 570 }
620 /* Check whether we have space after 571 /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
666 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 617 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
667} 618}
668 619
669#define EXT4_META_BLOCK 0x1
670
671static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
672 ext4_lblk_t iblock, ext4_fsblk_t goal,
673 unsigned long *count, int *errp, int flags)
674{
675 struct ext4_allocation_request ar;
676 ext4_fsblk_t ret;
677
678 memset(&ar, 0, sizeof(ar));
679 /* Fill with neighbour allocated blocks */
680
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = *count;
684 ar.logical = iblock;
685
686 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
687 /* enable in-core preallocation for data block allocation */
688 ar.flags = EXT4_MB_HINT_DATA;
689 else
690 /* disable in-core preallocation for non-regular files */
691 ar.flags = 0;
692
693 ret = ext4_mb_new_blocks(handle, &ar, errp);
694 *count = ar.len;
695 return ret;
696}
697
698/* 620/*
699 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks 621 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
700 * 622 *
701 * @handle: handle to this transaction 623 * @handle: handle to this transaction
702 * @inode: file inode 624 * @inode: file inode
703 * @goal: given target block(filesystem wide) 625 * @goal: given target block(filesystem wide)
704 * @count: total number of blocks need 626 * @count: pointer to total number of blocks needed
705 * @errp: error code 627 * @errp: error code
706 * 628 *
707 * Return 1st allocated block numberon success, *count stores total account 629 * Return 1st allocated block number on success, *count stores total account
708 * error stores in errp pointer 630 * error stores in errp pointer
709 */ 631 */
710ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 632ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
711 ext4_fsblk_t goal, unsigned long *count, int *errp) 633 ext4_fsblk_t goal, unsigned long *count, int *errp)
712{ 634{
635 struct ext4_allocation_request ar;
713 ext4_fsblk_t ret; 636 ext4_fsblk_t ret;
714 ret = do_blk_alloc(handle, inode, 0, goal, 637
715 count, errp, EXT4_META_BLOCK); 638 memset(&ar, 0, sizeof(ar));
639 /* Fill with neighbour allocated blocks */
640 ar.inode = inode;
641 ar.goal = goal;
642 ar.len = count ? *count : 1;
643
644 ret = ext4_mb_new_blocks(handle, &ar, errp);
645 if (count)
646 *count = ar.len;
647
716 /* 648 /*
717 * Account for the allocated meta blocks 649 * Account for the allocated meta blocks
718 */ 650 */
719 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 651 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
720 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 652 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
721 EXT4_I(inode)->i_allocated_meta_blocks += *count; 653 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
722 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 654 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
723 } 655 }
724 return ret; 656 return ret;
725} 657}
726 658
727/*
728 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
729 *
730 * @handle: handle to this transaction
731 * @inode: file inode
732 * @goal: given target block(filesystem wide)
733 * @errp: error code
734 *
735 * Return allocated block number on success
736 */
737ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
738 ext4_fsblk_t goal, int *errp)
739{
740 unsigned long count = 1;
741 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
742}
743
744/*
745 * ext4_new_blocks() -- allocate data blocks
746 *
747 * @handle: handle to this transaction
748 * @inode: file inode
749 * @goal: given target block(filesystem wide)
750 * @count: total number of blocks need
751 * @errp: error code
752 *
753 * Return 1st allocated block numberon success, *count stores total account
754 * error stores in errp pointer
755 */
756
757ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
758 ext4_lblk_t iblock, ext4_fsblk_t goal,
759 unsigned long *count, int *errp)
760{
761 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
762}
763
764/** 659/**
765 * ext4_count_free_blocks() -- count filesystem free blocks 660 * ext4_count_free_blocks() -- count filesystem free blocks
766 * @sb: superblock 661 * @sb: superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
776#ifdef EXT4FS_DEBUG 671#ifdef EXT4FS_DEBUG
777 struct ext4_super_block *es; 672 struct ext4_super_block *es;
778 ext4_fsblk_t bitmap_count; 673 ext4_fsblk_t bitmap_count;
779 unsigned long x; 674 unsigned int x;
780 struct buffer_head *bitmap_bh = NULL; 675 struct buffer_head *bitmap_bh = NULL;
781 676
782 es = EXT4_SB(sb)->s_es; 677 es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
796 continue; 691 continue;
797 692
798 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 693 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
799 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 694 printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
800 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 695 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
801 bitmap_count += x; 696 bitmap_count += x;
802 } 697 }
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
812 gdp = ext4_get_group_desc(sb, i, NULL); 707 gdp = ext4_get_group_desc(sb, i, NULL);
813 if (!gdp) 708 if (!gdp)
814 continue; 709 continue;
815 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 710 desc_count += ext4_free_blks_count(sb, gdp);
816 } 711 }
817 712
818 return desc_count; 713 return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) 18unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i, sum = 0;
21 unsigned long sum = 0;
22 21
23 if (!map) 22 if (!map)
24 return 0; 23 return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error(dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%u, inode=%u, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
95 void *dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned int offset;
99 int i, stored; 99 int i, stored;
100 struct ext4_dir_entry_2 *de; 100 struct ext4_dir_entry_2 *de;
101 struct super_block *sb; 101 struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
405 sb = inode->i_sb; 405 sb = inode->i_sb;
406 406
407 if (!fname) { 407 if (!fname) {
408 printk(KERN_ERR "ext4: call_filldir: called with " 408 printk(KERN_ERR "EXT4-fs: call_filldir: called with "
409 "null fname?!?\n"); 409 "null fname?!?\n");
410 return 0; 410 return 0;
411 } 411 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c46c648430d..c668e4377d76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
19#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h>
22#include "ext4_i.h" 23#include "ext4_i.h"
23 24
24/* 25/*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
94 /* phys. block for ^^^ */ 95 /* phys. block for ^^^ */
95 ext4_fsblk_t pright; 96 ext4_fsblk_t pright;
96 /* how many blocks we want to allocate */ 97 /* how many blocks we want to allocate */
97 unsigned long len; 98 unsigned int len;
98 /* flags. see above EXT4_MB_HINT_* */ 99 /* flags. see above EXT4_MB_HINT_* */
99 unsigned long flags; 100 unsigned int flags;
100}; 101};
101 102
102/* 103/*
@@ -156,12 +157,12 @@ struct ext4_group_desc
156 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ 157 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
157 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ 158 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
158 __le32 bg_inode_table_lo; /* Inodes table block */ 159 __le32 bg_inode_table_lo; /* Inodes table block */
159 __le16 bg_free_blocks_count; /* Free blocks count */ 160 __le16 bg_free_blocks_count_lo;/* Free blocks count */
160 __le16 bg_free_inodes_count; /* Free inodes count */ 161 __le16 bg_free_inodes_count_lo;/* Free inodes count */
161 __le16 bg_used_dirs_count; /* Directories count */ 162 __le16 bg_used_dirs_count_lo; /* Directories count */
162 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ 163 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
163 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ 164 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
164 __le16 bg_itable_unused; /* Unused inodes count */ 165 __le16 bg_itable_unused_lo; /* Unused inodes count */
165 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ 166 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
166 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 167 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
167 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ 168 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
169 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ 170 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
170 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ 171 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
171 __le16 bg_used_dirs_count_hi; /* Directories count MSB */ 172 __le16 bg_used_dirs_count_hi; /* Directories count MSB */
172 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ 173 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
173 __u32 bg_reserved2[3]; 174 __u32 bg_reserved2[3];
174}; 175};
175 176
@@ -328,6 +329,7 @@ struct ext4_mount_options {
328 uid_t s_resuid; 329 uid_t s_resuid;
329 gid_t s_resgid; 330 gid_t s_resgid;
330 unsigned long s_commit_interval; 331 unsigned long s_commit_interval;
332 u32 s_min_batch_time, s_max_batch_time;
331#ifdef CONFIG_QUOTA 333#ifdef CONFIG_QUOTA
332 int s_jquota_fmt; 334 int s_jquota_fmt;
333 char *s_qf_names[MAXQUOTAS]; 335 char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do { \
534#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 536#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
535#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 537#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
536#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 538#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
537#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
726 */ 727 */
727 728
728#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
729 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) 730 ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
730#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
731 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) 732 ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
732#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
733 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) 734 ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
734#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
735 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
736#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
806#define EXT4_DEFM_JMODE_WBACK 0x0060 807#define EXT4_DEFM_JMODE_WBACK 0x0060
807 808
808/* 809/*
810 * Default journal batch times
811 */
812#define EXT4_DEF_MIN_BATCH_TIME 0
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814
815/*
809 * Structure of a directory entry 816 * Structure of a directory entry
810 */ 817 */
811#define EXT4_NAME_LEN 255 818#define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
891#define DX_HASH_LEGACY 0 898#define DX_HASH_LEGACY 0
892#define DX_HASH_HALF_MD4 1 899#define DX_HASH_HALF_MD4 1
893#define DX_HASH_TEA 2 900#define DX_HASH_TEA 2
901#define DX_HASH_LEGACY_UNSIGNED 3
902#define DX_HASH_HALF_MD4_UNSIGNED 4
903#define DX_HASH_TEA_UNSIGNED 5
894 904
895#ifdef __KERNEL__ 905#ifdef __KERNEL__
896 906
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
955#define ERR_BAD_DX_DIR -75000 965#define ERR_BAD_DX_DIR -75000
956 966
957void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 967void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
958 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 968 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
959 969
960extern struct proc_dir_entry *ext4_proc_root; 970extern struct proc_dir_entry *ext4_proc_root;
961 971
@@ -987,6 +997,9 @@ do { \
987# define ATTRIB_NORET __attribute__((noreturn)) 997# define ATTRIB_NORET __attribute__((noreturn))
988# define NORET_AND noreturn, 998# define NORET_AND noreturn,
989 999
1000/* bitmap.c */
1001extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
1002
990/* balloc.c */ 1003/* balloc.c */
991extern unsigned int ext4_block_group(struct super_block *sb, 1004extern unsigned int ext4_block_group(struct super_block *sb,
992 ext4_fsblk_t blocknr); 1005 ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
995extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 1008extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
996extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 1009extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
997 ext4_group_t group); 1010 ext4_group_t group);
998extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
999 ext4_fsblk_t goal, int *errp);
1000extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1011extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1001 ext4_fsblk_t goal, unsigned long *count, int *errp); 1012 ext4_fsblk_t goal, unsigned long *count, int *errp);
1002extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1003 ext4_lblk_t iblock, ext4_fsblk_t goal,
1004 unsigned long *count, int *errp);
1005extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1013extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1006extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1014extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1007extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1015extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1008 ext4_fsblk_t block, unsigned long count, int metadata); 1016 ext4_fsblk_t block, unsigned long count, int metadata);
1009extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 1017extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1010 ext4_fsblk_t block, unsigned long count, 1018 ext4_fsblk_t block, unsigned long count);
1011 unsigned long *pdquot_freed_blocks);
1012extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1019extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
1013extern void ext4_check_blocks_bitmap(struct super_block *); 1020extern void ext4_check_blocks_bitmap(struct super_block *);
1014extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1021extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1019/* dir.c */ 1026/* dir.c */
1020extern int ext4_check_dir_entry(const char *, struct inode *, 1027extern int ext4_check_dir_entry(const char *, struct inode *,
1021 struct ext4_dir_entry_2 *, 1028 struct ext4_dir_entry_2 *,
1022 struct buffer_head *, unsigned long); 1029 struct buffer_head *, unsigned int);
1023extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1030extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1024 __u32 minor_hash, 1031 __u32 minor_hash,
1025 struct ext4_dir_entry_2 *dirent); 1032 struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1039extern unsigned long ext4_count_free_inodes(struct super_block *); 1046extern unsigned long ext4_count_free_inodes(struct super_block *);
1040extern unsigned long ext4_count_dirs(struct super_block *); 1047extern unsigned long ext4_count_dirs(struct super_block *);
1041extern void ext4_check_inodes_bitmap(struct super_block *); 1048extern void ext4_check_inodes_bitmap(struct super_block *);
1042extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1043 1049
1044/* mballoc.c */ 1050/* mballoc.c */
1045extern long ext4_mb_stats; 1051extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
1054extern void exit_ext4_mballoc(void); 1060extern void exit_ext4_mballoc(void);
1055extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1061extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1056 unsigned long, unsigned long, int, unsigned long *); 1062 unsigned long, unsigned long, int, unsigned long *);
1057extern int ext4_mb_add_more_groupinfo(struct super_block *sb, 1063extern int ext4_mb_add_groupinfo(struct super_block *sb,
1058 ext4_group_t i, struct ext4_group_desc *desc); 1064 ext4_group_t i, struct ext4_group_desc *desc);
1059extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1065extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1060 ext4_grpblk_t add); 1066 ext4_grpblk_t add);
1061 1067extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1062 1068extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1069 ext4_group_t, int);
1063/* inode.c */ 1070/* inode.c */
1064int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1071int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1065 struct buffer_head *bh, ext4_fsblk_t blocknr); 1072 struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
1069 ext4_lblk_t, int, int *); 1076 ext4_lblk_t, int, int *);
1070int ext4_get_block(struct inode *inode, sector_t iblock, 1077int ext4_get_block(struct inode *inode, sector_t iblock,
1071 struct buffer_head *bh_result, int create); 1078 struct buffer_head *bh_result, int create);
1072int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1073 ext4_lblk_t iblock, unsigned long maxblocks,
1074 struct buffer_head *bh_result,
1075 int create, int extend_disksize);
1076 1079
1077extern struct inode *ext4_iget(struct super_block *, unsigned long); 1080extern struct inode *ext4_iget(struct super_block *, unsigned long);
1078extern int ext4_write_inode(struct inode *, int); 1081extern int ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1126 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1127extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1128 __attribute__ ((format (printf, 3, 4)));
1129extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1130 const char *, const char *, ...)
1131 __attribute__ ((format (printf, 4, 5)));
1126extern void ext4_update_dynamic_rev(struct super_block *sb); 1132extern void ext4_update_dynamic_rev(struct super_block *sb);
1127extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1133extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1128 __u32 compat); 1134 __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1136 struct ext4_group_desc *bg); 1142 struct ext4_group_desc *bg);
1137extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1143extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1138 struct ext4_group_desc *bg); 1144 struct ext4_group_desc *bg);
1145extern __u32 ext4_free_blks_count(struct super_block *sb,
1146 struct ext4_group_desc *bg);
1147extern __u32 ext4_free_inodes_count(struct super_block *sb,
1148 struct ext4_group_desc *bg);
1149extern __u32 ext4_used_dirs_count(struct super_block *sb,
1150 struct ext4_group_desc *bg);
1151extern __u32 ext4_itable_unused_count(struct super_block *sb,
1152 struct ext4_group_desc *bg);
1139extern void ext4_block_bitmap_set(struct super_block *sb, 1153extern void ext4_block_bitmap_set(struct super_block *sb,
1140 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1154 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1141extern void ext4_inode_bitmap_set(struct super_block *sb, 1155extern void ext4_inode_bitmap_set(struct super_block *sb,
1142 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1156 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1143extern void ext4_inode_table_set(struct super_block *sb, 1157extern void ext4_inode_table_set(struct super_block *sb,
1144 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1158 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1159extern void ext4_free_blks_set(struct super_block *sb,
1160 struct ext4_group_desc *bg, __u32 count);
1161extern void ext4_free_inodes_set(struct super_block *sb,
1162 struct ext4_group_desc *bg, __u32 count);
1163extern void ext4_used_dirs_set(struct super_block *sb,
1164 struct ext4_group_desc *bg, __u32 count);
1165extern void ext4_itable_unused_set(struct super_block *sb,
1166 struct ext4_group_desc *bg, __u32 count);
1145 1167
1146static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1168static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1147{ 1169{
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1246 return ; 1268 return ;
1247} 1269}
1248 1270
1271struct ext4_group_info {
1272 unsigned long bb_state;
1273 struct rb_root bb_free_root;
1274 unsigned short bb_first_free;
1275 unsigned short bb_free;
1276 unsigned short bb_fragments;
1277 struct list_head bb_prealloc_list;
1278#ifdef DOUBLE_CHECK
1279 void *bb_bitmap;
1280#endif
1281 struct rw_semaphore alloc_sem;
1282 unsigned short bb_counters[];
1283};
1284
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287
1288#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1292{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296}
1297
1298static inline void ext4_unlock_group(struct super_block *sb,
1299 ext4_group_t group)
1300{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304}
1305
1306static inline int ext4_is_group_locked(struct super_block *sb,
1307 ext4_group_t group)
1308{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313}
1314
1249/* 1315/*
1250 * Inodes and files operations 1316 * Inodes and files operations
1251 */ 1317 */
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1271extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1337extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1272 int chunk); 1338 int chunk);
1273extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1274 ext4_lblk_t iblock, 1340 ext4_lblk_t iblock, unsigned int max_blocks,
1275 unsigned long max_blocks, struct buffer_head *bh_result, 1341 struct buffer_head *bh_result,
1276 int create, int extend_disksize); 1342 int create, int extend_disksize);
1277extern void ext4_ext_truncate(struct inode *); 1343extern void ext4_ext_truncate(struct inode *);
1278extern void ext4_ext_init(struct super_block *); 1344extern void ext4_ext_init(struct super_block *);
1279extern void ext4_ext_release(struct super_block *); 1345extern void ext4_ext_release(struct super_block *);
1280extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1281 loff_t len); 1347 loff_t len);
1282extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1283 sector_t block, unsigned long max_blocks, 1349 sector_t block, unsigned int max_blocks,
1284 struct buffer_head *bh, int create, 1350 struct buffer_head *bh, int create,
1285 int extend_disksize, int flag); 1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len);
1354
1355/*
1356 * Add new method to test wether block and inode bitmaps are properly
1357 * initialized. With uninit_bg reading the block from disk is not enough
1358 * to mark the bitmap uptodate. We need to also zero-out the bitmap
1359 */
1360#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
1361
1362static inline int bitmap_uptodate(struct buffer_head *bh)
1363{
1364 return (buffer_uptodate(bh) &&
1365 test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
1366}
1367static inline void set_bitmap_uptodate(struct buffer_head *bh)
1368{
1369 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1370}
1371
1286#endif /* __KERNEL__ */ 1372#endif /* __KERNEL__ */
1287 1373
1288#endif /* _EXT4_H */ 1374#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); 194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
195} 195}
196 196
197static inline void ext4_ext_tree_changed(struct inode *inode)
198{
199 EXT4_I(inode)->i_ext_generation++;
200}
201
202static inline void 197static inline void
203ext4_ext_invalidate_cache(struct inode *inode) 198ext4_ext_invalidate_cache(struct inode *inode)
204{ 199{
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
31typedef __u32 ext4_lblk_t; 31typedef __u32 ext4_lblk_t;
32 32
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
100 */ 100 */
101 loff_t i_disksize; 101 loff_t i_disksize;
102 102
103 /* on-disk additional length */
104 __u16 i_extra_isize;
105
106 /* 103 /*
107 * i_data_sem is for serialising ext4_truncate() against 104 * i_data_sem is for serialising ext4_truncate() against
108 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's 105 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
117 struct inode vfs_inode; 114 struct inode vfs_inode;
118 struct jbd2_inode jinode; 115 struct jbd2_inode jinode;
119 116
120 unsigned long i_ext_generation;
121 struct ext4_ext_cache i_cached_extent; 117 struct ext4_ext_cache i_cached_extent;
122 /* 118 /*
123 * File creation time. Its function is same as that of 119 * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
130 spinlock_t i_prealloc_lock; 126 spinlock_t i_prealloc_lock;
131 127
132 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
133 unsigned long i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
134 unsigned long i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
135 unsigned long i_allocated_meta_blocks; 131 unsigned int i_allocated_meta_blocks;
136 unsigned short i_delalloc_reserved_flag; 132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock; 137 spinlock_t i_block_reservation_lock;
138}; 138};
139 139
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 7int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 8 struct buffer_head *bh)
9{ 9{
10 int err = jbd2_journal_get_undo_access(handle, bh); 10 int err = 0;
11 if (err) 11
12 ext4_journal_abort_handle(where, __func__, bh, handle, err); 12 if (ext4_handle_valid(handle)) {
13 err = jbd2_journal_get_undo_access(handle, bh);
14 if (err)
15 ext4_journal_abort_handle(where, __func__, bh,
16 handle, err);
17 }
13 return err; 18 return err;
14} 19}
15 20
16int __ext4_journal_get_write_access(const char *where, handle_t *handle, 21int __ext4_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh) 22 struct buffer_head *bh)
18{ 23{
19 int err = jbd2_journal_get_write_access(handle, bh); 24 int err = 0;
20 if (err) 25
21 ext4_journal_abort_handle(where, __func__, bh, handle, err); 26 if (ext4_handle_valid(handle)) {
27 err = jbd2_journal_get_write_access(handle, bh);
28 if (err)
29 ext4_journal_abort_handle(where, __func__, bh,
30 handle, err);
31 }
22 return err; 32 return err;
23} 33}
24 34
25int __ext4_journal_forget(const char *where, handle_t *handle, 35int __ext4_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh) 36 struct buffer_head *bh)
27{ 37{
28 int err = jbd2_journal_forget(handle, bh); 38 int err = 0;
29 if (err) 39
30 ext4_journal_abort_handle(where, __func__, bh, handle, err); 40 if (ext4_handle_valid(handle)) {
41 err = jbd2_journal_forget(handle, bh);
42 if (err)
43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err);
45 }
31 return err; 46 return err;
32} 47}
33 48
34int __ext4_journal_revoke(const char *where, handle_t *handle, 49int __ext4_journal_revoke(const char *where, handle_t *handle,
35 ext4_fsblk_t blocknr, struct buffer_head *bh) 50 ext4_fsblk_t blocknr, struct buffer_head *bh)
36{ 51{
37 int err = jbd2_journal_revoke(handle, blocknr, bh); 52 int err = 0;
38 if (err) 53
39 ext4_journal_abort_handle(where, __func__, bh, handle, err); 54 if (ext4_handle_valid(handle)) {
55 err = jbd2_journal_revoke(handle, blocknr, bh);
56 if (err)
57 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err);
59 }
40 return err; 60 return err;
41} 61}
42 62
43int __ext4_journal_get_create_access(const char *where, 63int __ext4_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh) 64 handle_t *handle, struct buffer_head *bh)
45{ 65{
46 int err = jbd2_journal_get_create_access(handle, bh); 66 int err = 0;
47 if (err) 67
48 ext4_journal_abort_handle(where, __func__, bh, handle, err); 68 if (ext4_handle_valid(handle)) {
69 err = jbd2_journal_get_create_access(handle, bh);
70 if (err)
71 ext4_journal_abort_handle(where, __func__, bh,
72 handle, err);
73 }
49 return err; 74 return err;
50} 75}
51 76
52int __ext4_journal_dirty_metadata(const char *where, 77int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
53 handle_t *handle, struct buffer_head *bh) 78 struct inode *inode, struct buffer_head *bh)
54{ 79{
55 int err = jbd2_journal_dirty_metadata(handle, bh); 80 int err = 0;
56 if (err) 81
57 ext4_journal_abort_handle(where, __func__, bh, handle, err); 82 if (ext4_handle_valid(handle)) {
83 err = jbd2_journal_dirty_metadata(handle, bh);
84 if (err)
85 ext4_journal_abort_handle(where, __func__, bh,
86 handle, err);
87 } else {
88 mark_buffer_dirty(bh);
89 if (inode && inode_needs_sync(inode)) {
90 sync_dirty_buffer(bh);
91 if (buffer_req(bh) && !buffer_uptodate(bh)) {
92 ext4_error(inode->i_sb, __func__,
93 "IO error syncing inode, "
94 "inode=%lu, block=%llu",
95 inode->i_ino,
96 (unsigned long long) bh->b_blocknr);
97 err = -EIO;
98 }
99 }
100 }
58 return err; 101 return err;
59} 102}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
32 * 5 levels of tree + root which are stored in the inode. */ 32 * 5 levels of tree + root which are stored in the inode. */
33 33
34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 || test_opt(sb, EXTENTS) ? 27U : 8U) 36 ? 27U : 8U)
37 37
38/* Extended attribute operations touch at most two data buffers, 38/* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode 39 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122 * been done yet. 122 * been done yet.
123 */ 123 */
124 124
125static inline void ext4_journal_release_buffer(handle_t *handle,
126 struct buffer_head *bh)
127{
128 jbd2_journal_release_buffer(handle, bh);
129}
130
131void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
132 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
133 127
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
146int __ext4_journal_get_create_access(const char *where, 140int __ext4_journal_get_create_access(const char *where,
147 handle_t *handle, struct buffer_head *bh); 141 handle_t *handle, struct buffer_head *bh);
148 142
149int __ext4_journal_dirty_metadata(const char *where, 143int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
150 handle_t *handle, struct buffer_head *bh); 144 struct inode *inode, struct buffer_head *bh);
151 145
152#define ext4_journal_get_undo_access(handle, bh) \ 146#define ext4_journal_get_undo_access(handle, bh) \
153 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 147 __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
157 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 151 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
158#define ext4_journal_get_create_access(handle, bh) \ 152#define ext4_journal_get_create_access(handle, bh) \
159 __ext4_journal_get_create_access(__func__, (handle), (bh)) 153 __ext4_journal_get_create_access(__func__, (handle), (bh))
160#define ext4_journal_dirty_metadata(handle, bh) \
161 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
162#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
163 __ext4_journal_forget(__func__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156#define ext4_handle_dirty_metadata(handle, inode, bh) \
157 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
164 158
165handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
166int __ext4_journal_stop(const char *where, handle_t *handle); 160int __ext4_journal_stop(const char *where, handle_t *handle);
167 161
162#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
163
164static inline int ext4_handle_valid(handle_t *handle)
165{
166 if (handle == EXT4_NOJOURNAL_HANDLE)
167 return 0;
168 return 1;
169}
170
171static inline void ext4_handle_sync(handle_t *handle)
172{
173 if (ext4_handle_valid(handle))
174 handle->h_sync = 1;
175}
176
177static inline void ext4_handle_release_buffer(handle_t *handle,
178 struct buffer_head *bh)
179{
180 if (ext4_handle_valid(handle))
181 jbd2_journal_release_buffer(handle, bh);
182}
183
184static inline int ext4_handle_is_aborted(handle_t *handle)
185{
186 if (ext4_handle_valid(handle))
187 return is_handle_aborted(handle);
188 return 0;
189}
190
191static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
192{
193 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
194 return 0;
195 return 1;
196}
197
198static inline void ext4_journal_release_buffer(handle_t *handle,
199 struct buffer_head *bh)
200{
201 if (ext4_handle_valid(handle))
202 jbd2_journal_release_buffer(handle, bh);
203}
204
168static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
169{ 206{
170 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
180 217
181static inline int ext4_journal_extend(handle_t *handle, int nblocks) 218static inline int ext4_journal_extend(handle_t *handle, int nblocks)
182{ 219{
183 return jbd2_journal_extend(handle, nblocks); 220 if (ext4_handle_valid(handle))
221 return jbd2_journal_extend(handle, nblocks);
222 return 0;
184} 223}
185 224
186static inline int ext4_journal_restart(handle_t *handle, int nblocks) 225static inline int ext4_journal_restart(handle_t *handle, int nblocks)
187{ 226{
188 return jbd2_journal_restart(handle, nblocks); 227 if (ext4_handle_valid(handle))
228 return jbd2_journal_restart(handle, nblocks);
229 return 0;
189} 230}
190 231
191static inline int ext4_journal_blocks_per_page(struct inode *inode) 232static inline int ext4_journal_blocks_per_page(struct inode *inode)
192{ 233{
193 return jbd2_journal_blocks_per_page(inode); 234 if (EXT4_JOURNAL(inode) != NULL)
235 return jbd2_journal_blocks_per_page(inode);
236 return 0;
194} 237}
195 238
196static inline int ext4_journal_force_commit(journal_t *journal) 239static inline int ext4_journal_force_commit(journal_t *journal)
197{ 240{
198 return jbd2_journal_force_commit(journal); 241 if (journal)
242 return jbd2_journal_force_commit(journal);
243 return 0;
199} 244}
200 245
201static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 246static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
202{ 247{
203 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 248 if (ext4_handle_valid(handle))
249 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
250 return 0;
204} 251}
205 252
206/* super.c */ 253/* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
208 255
209static inline int ext4_should_journal_data(struct inode *inode) 256static inline int ext4_should_journal_data(struct inode *inode)
210{ 257{
258 if (EXT4_JOURNAL(inode) == NULL)
259 return 0;
211 if (!S_ISREG(inode->i_mode)) 260 if (!S_ISREG(inode->i_mode))
212 return 1; 261 return 1;
213 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 262 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
219 268
220static inline int ext4_should_order_data(struct inode *inode) 269static inline int ext4_should_order_data(struct inode *inode)
221{ 270{
271 if (EXT4_JOURNAL(inode) == NULL)
272 return 0;
222 if (!S_ISREG(inode->i_mode)) 273 if (!S_ISREG(inode->i_mode))
223 return 0; 274 return 0;
224 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 275 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
230 281
231static inline int ext4_should_writeback_data(struct inode *inode) 282static inline int ext4_should_writeback_data(struct inode *inode)
232{ 283{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
233 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
234 return 0; 287 return 0;
235 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index b21f16713db0..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
57 u32 s_next_generation; 57 u32 s_next_generation;
58 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
59 int s_def_hash_version; 59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
60 struct percpu_counter s_freeblocks_counter; 61 struct percpu_counter s_freeblocks_counter;
61 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
62 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
73 struct journal_s *s_journal; 74 struct journal_s *s_journal;
74 struct list_head s_orphan; 75 struct list_head s_orphan;
75 unsigned long s_commit_interval; 76 unsigned long s_commit_interval;
77 u32 s_max_batch_time;
78 u32 s_min_batch_time;
76 struct block_device *journal_bdev; 79 struct block_device *journal_bdev;
77#ifdef CONFIG_JBD2_DEBUG 80#ifdef CONFIG_JBD2_DEBUG
78 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
101 spinlock_t s_reserve_lock; 104 spinlock_t s_reserve_lock;
102 spinlock_t s_md_lock; 105 spinlock_t s_md_lock;
103 tid_t s_last_transaction; 106 tid_t s_last_transaction;
104 unsigned short *s_mb_offsets, *s_mb_maxs; 107 unsigned short *s_mb_offsets;
108 unsigned int *s_mb_maxs;
105 109
106 /* tunables */ 110 /* tunables */
107 unsigned long s_stripe; 111 unsigned long s_stripe;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..54bf0623a9ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
97{ 97{
98 int err; 98 int err;
99 99
100 if (!ext4_handle_valid(handle))
101 return 0;
100 if (handle->h_buffer_credits > needed) 102 if (handle->h_buffer_credits > needed)
101 return 0; 103 return 0;
102 err = ext4_journal_extend(handle, needed); 104 err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
134 int err; 136 int err;
135 if (path->p_bh) { 137 if (path->p_bh) {
136 /* path points to block */ 138 /* path points to block */
137 err = ext4_journal_dirty_metadata(handle, path->p_bh); 139 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
138 } else { 140 } else {
139 /* path points to leaf/index in inode body */ 141 /* path points to leaf/index in inode body */
140 err = ext4_mark_inode_dirty(handle, inode); 142 err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
191 ext4_fsblk_t goal, newblock; 193 ext4_fsblk_t goal, newblock;
192 194
193 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 195 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
194 newblock = ext4_new_meta_block(handle, inode, goal, err); 196 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
195 return newblock; 197 return newblock;
196} 198}
197 199
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
780 set_buffer_uptodate(bh); 782 set_buffer_uptodate(bh);
781 unlock_buffer(bh); 783 unlock_buffer(bh);
782 784
783 err = ext4_journal_dirty_metadata(handle, bh); 785 err = ext4_handle_dirty_metadata(handle, inode, bh);
784 if (err) 786 if (err)
785 goto cleanup; 787 goto cleanup;
786 brelse(bh); 788 brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
859 set_buffer_uptodate(bh); 861 set_buffer_uptodate(bh);
860 unlock_buffer(bh); 862 unlock_buffer(bh);
861 863
862 err = ext4_journal_dirty_metadata(handle, bh); 864 err = ext4_handle_dirty_metadata(handle, inode, bh);
863 if (err) 865 if (err)
864 goto cleanup; 866 goto cleanup;
865 brelse(bh); 867 brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
955 set_buffer_uptodate(bh); 957 set_buffer_uptodate(bh);
956 unlock_buffer(bh); 958 unlock_buffer(bh);
957 959
958 err = ext4_journal_dirty_metadata(handle, bh); 960 err = ext4_handle_dirty_metadata(handle, inode, bh);
959 if (err) 961 if (err)
960 goto out; 962 goto out;
961 963
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1160 while (--depth >= 0) { 1162 while (--depth >= 0) {
1161 ix = path[depth].p_idx; 1163 ix = path[depth].p_idx;
1162 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1164 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1163 break; 1165 goto got_index;
1164 } 1166 }
1165 1167
1166 if (depth < 0) { 1168 /* we've gone up to the root and found no index to the right */
1167 /* we've gone up to the root and 1169 return 0;
1168 * found no index to the right */
1169 return 0;
1170 }
1171 1170
1171got_index:
1172 /* we've found index to the right, let's 1172 /* we've found index to the right, let's
1173 * follow it and find the closest allocated 1173 * follow it and find the closest allocated
1174 * block to the right */ 1174 * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1201 *phys = ext_pblock(ex); 1201 *phys = ext_pblock(ex);
1202 put_bh(bh); 1202 put_bh(bh);
1203 return 0; 1203 return 0;
1204
1205} 1204}
1206 1205
1207/* 1206/*
@@ -1622,7 +1621,6 @@ cleanup:
1622 ext4_ext_drop_refs(npath); 1621 ext4_ext_drop_refs(npath);
1623 kfree(npath); 1622 kfree(npath);
1624 } 1623 }
1625 ext4_ext_tree_changed(inode);
1626 ext4_ext_invalidate_cache(inode); 1624 ext4_ext_invalidate_cache(inode);
1627 return err; 1625 return err;
1628} 1626}
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2233 } 2231 }
2234 } 2232 }
2235out: 2233out:
2236 ext4_ext_tree_changed(inode);
2237 ext4_ext_drop_refs(path); 2234 ext4_ext_drop_refs(path);
2238 kfree(path); 2235 kfree(path);
2239 ext4_journal_stop(handle); 2236 ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
2250 * possible initialization would be here 2247 * possible initialization would be here
2251 */ 2248 */
2252 2249
2253 if (test_opt(sb, EXTENTS)) { 2250 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2254 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2251 printk(KERN_INFO "EXT4-fs: file extents enabled");
2255#ifdef AGGRESSIVE_TEST 2252#ifdef AGGRESSIVE_TEST
2256 printk(", aggressive tests"); 2253 printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
2275 */ 2272 */
2276void ext4_ext_release(struct super_block *sb) 2273void ext4_ext_release(struct super_block *sb)
2277{ 2274{
2278 if (!test_opt(sb, EXTENTS)) 2275 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2279 return; 2276 return;
2280 2277
2281#ifdef EXTENTS_STATS 2278#ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2380 struct inode *inode, 2377 struct inode *inode,
2381 struct ext4_ext_path *path, 2378 struct ext4_ext_path *path,
2382 ext4_lblk_t iblock, 2379 ext4_lblk_t iblock,
2383 unsigned long max_blocks) 2380 unsigned int max_blocks)
2384{ 2381{
2385 struct ext4_extent *ex, newex, orig_ex; 2382 struct ext4_extent *ex, newex, orig_ex;
2386 struct ext4_extent *ex1 = NULL; 2383 struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2536 */ 2533 */
2537 newdepth = ext_depth(inode); 2534 newdepth = ext_depth(inode);
2538 /* 2535 /*
2539 * update the extent length after successfull insert of the 2536 * update the extent length after successful insert of the
2540 * split extent 2537 * split extent
2541 */ 2538 */
2542 orig_ex.ee_len = cpu_to_le16(ee_len - 2539 orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
2678 */ 2675 */
2679int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2676int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2680 ext4_lblk_t iblock, 2677 ext4_lblk_t iblock,
2681 unsigned long max_blocks, struct buffer_head *bh_result, 2678 unsigned int max_blocks, struct buffer_head *bh_result,
2682 int create, int extend_disksize) 2679 int create, int extend_disksize)
2683{ 2680{
2684 struct ext4_ext_path *path = NULL; 2681 struct ext4_ext_path *path = NULL;
2685 struct ext4_extent_header *eh; 2682 struct ext4_extent_header *eh;
2686 struct ext4_extent newex, *ex; 2683 struct ext4_extent newex, *ex;
2687 ext4_fsblk_t goal, newblock; 2684 ext4_fsblk_t newblock;
2688 int err = 0, depth, ret; 2685 int err = 0, depth, ret, cache_type;
2689 unsigned long allocated = 0; 2686 unsigned int allocated = 0;
2690 struct ext4_allocation_request ar; 2687 struct ext4_allocation_request ar;
2691 loff_t disksize; 2688 loff_t disksize;
2692 2689
2693 __clear_bit(BH_New, &bh_result->b_state); 2690 __clear_bit(BH_New, &bh_result->b_state);
2694 ext_debug("blocks %u/%lu requested for inode %u\n", 2691 ext_debug("blocks %u/%u requested for inode %u\n",
2695 iblock, max_blocks, inode->i_ino); 2692 iblock, max_blocks, inode->i_ino);
2696 2693
2697 /* check in cache */ 2694 /* check in cache */
2698 goal = ext4_ext_in_cache(inode, iblock, &newex); 2695 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2699 if (goal) { 2696 if (cache_type) {
2700 if (goal == EXT4_EXT_CACHE_GAP) { 2697 if (cache_type == EXT4_EXT_CACHE_GAP) {
2701 if (!create) { 2698 if (!create) {
2702 /* 2699 /*
2703 * block isn't allocated yet and 2700 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2706 goto out2; 2703 goto out2;
2707 } 2704 }
2708 /* we should allocate requested block */ 2705 /* we should allocate requested block */
2709 } else if (goal == EXT4_EXT_CACHE_EXTENT) { 2706 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
2710 /* block is already allocated */ 2707 /* block is already allocated */
2711 newblock = iblock 2708 newblock = iblock
2712 - le32_to_cpu(newex.ee_block) 2709 - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2854 if (!newblock) 2851 if (!newblock)
2855 goto out2; 2852 goto out2;
2856 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2853 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2857 goal, newblock, allocated); 2854 ar.goal, newblock, allocated);
2858 2855
2859 /* try to insert new extent into found leaf and return */ 2856 /* try to insert new extent into found leaf and return */
2860 ext4_ext_store_pblock(&newex, newblock); 2857 ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
2950 * transaction synchronous. 2947 * transaction synchronous.
2951 */ 2948 */
2952 if (IS_SYNC(inode)) 2949 if (IS_SYNC(inode))
2953 handle->h_sync = 1; 2950 ext4_handle_sync(handle);
2954 2951
2955out_stop: 2952out_stop:
2956 up_write(&EXT4_I(inode)->i_data_sem); 2953 up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3004 handle_t *handle; 3001 handle_t *handle;
3005 ext4_lblk_t block; 3002 ext4_lblk_t block;
3006 loff_t new_size; 3003 loff_t new_size;
3007 unsigned long max_blocks; 3004 unsigned int max_blocks;
3008 int ret = 0; 3005 int ret = 0;
3009 int ret2 = 0; 3006 int ret2 = 0;
3010 int retries = 0; 3007 int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
3083/* 3080/*
3084 * Callback function called for each extent to gather FIEMAP information. 3081 * Callback function called for each extent to gather FIEMAP information.
3085 */ 3082 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3083static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3084 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data) 3085 void *data)
3089{ 3086{
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3152/* fiemap flags we can handle specified here */ 3149/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3150#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154 3151
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) 3152static int ext4_xattr_fiemap(struct inode *inode,
3153 struct fiemap_extent_info *fieinfo)
3156{ 3154{
3157 __u64 physical = 0; 3155 __u64 physical = 0;
3158 __u64 length; 3156 __u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
146const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
147 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
148 .read = do_sync_read, 145 .read = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash(const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
68}
69
70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
71{
72 __u32 pad, val;
73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
49} 96}
50 97
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
52{ 99{
53 __u32 pad, val; 100 __u32 pad, val;
54 int i; 101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i = 0; i < len; i++) { 110 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6e6052879aa2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
74 /* If checksum is bad mark all blocks and inodes use to prevent 74 /* If checksum is bad mark all blocks and inodes use to prevent
75 * allocation, essentially implementing a per-group read-only flag. */ 75 * allocation, essentially implementing a per-group read-only flag. */
76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
77 ext4_error(sb, __func__, "Checksum bad for group %lu\n", 77 ext4_error(sb, __func__, "Checksum bad for group %u",
78 block_group); 78 block_group);
79 gdp->bg_free_blocks_count = 0; 79 ext4_free_blks_set(sb, gdp, 0);
80 gdp->bg_free_inodes_count = 0; 80 ext4_free_inodes_set(sb, gdp, 0);
81 gdp->bg_itable_unused = 0; 81 ext4_itable_unused_set(sb, gdp, 0);
82 memset(bh->b_data, 0xff, sb->s_blocksize); 82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 return 0; 83 return 0;
84 } 84 }
85 85
86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
88 bh->b_data); 88 bh->b_data);
89 89
90 return EXT4_INODES_PER_GROUP(sb); 90 return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 if (unlikely(!bh)) { 111 if (unlikely(!bh)) {
112 ext4_error(sb, __func__, 112 ext4_error(sb, __func__,
113 "Cannot read inode bitmap - " 113 "Cannot read inode bitmap - "
114 "block_group = %lu, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (buffer_uptodate(bh) && 118 if (bitmap_uptodate(bh))
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
120 return bh; 119 return bh;
121 120
122 lock_buffer(bh); 121 lock_buffer(bh);
122 if (bitmap_uptodate(bh)) {
123 unlock_buffer(bh);
124 return bh;
125 }
123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
125 ext4_init_inode_bitmap(sb, bh, block_group, desc); 128 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh);
126 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
127 unlock_buffer(bh);
128 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
132 unlock_buffer(bh);
129 return bh; 133 return bh;
130 } 134 }
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
136 if (buffer_uptodate(bh)) {
137 /*
138 * if not uninit if bh is uptodate,
139 * bitmap is also uptodate
140 */
141 set_bitmap_uptodate(bh);
142 unlock_buffer(bh);
143 return bh;
144 }
145 /*
146 * submit the buffer_head for read. We can
147 * safely mark the bitmap as uptodate now.
148 * We do it here so the bitmap uptodate bit
149 * get set with buffer lock held.
150 */
151 set_bitmap_uptodate(bh);
132 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
133 put_bh(bh); 153 put_bh(bh);
134 ext4_error(sb, __func__, 154 ext4_error(sb, __func__,
135 "Cannot read inode bitmap - " 155 "Cannot read inode bitmap - "
136 "block_group = %lu, inode_bitmap = %llu", 156 "block_group = %u, inode_bitmap = %llu",
137 block_group, bitmap_blk); 157 block_group, bitmap_blk);
138 return NULL; 158 return NULL;
139 } 159 }
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
168 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
169 struct ext4_super_block *es; 189 struct ext4_super_block *es;
170 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
171 int fatal = 0, err; 191 int fatal = 0, err, count;
172 ext4_group_t flex_group; 192 ext4_group_t flex_group;
173 193
174 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
190 210
191 ino = inode->i_ino; 211 ino = inode->i_ino;
192 ext4_debug("freeing inode %lu\n", ino); 212 ext4_debug("freeing inode %lu\n", ino);
213 trace_mark(ext4_free_inode,
214 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
215 sb->s_id, inode->i_ino, inode->i_mode,
216 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
217 (unsigned long long) inode->i_blocks);
193 218
194 /* 219 /*
195 * Note: we must free any quota before locking the superblock, 220 * Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
236 261
237 if (gdp) { 262 if (gdp) {
238 spin_lock(sb_bgl_lock(sbi, block_group)); 263 spin_lock(sb_bgl_lock(sbi, block_group));
239 le16_add_cpu(&gdp->bg_free_inodes_count, 1); 264 count = ext4_free_inodes_count(sb, gdp) + 1;
240 if (is_directory) 265 ext4_free_inodes_set(sb, gdp, count);
241 le16_add_cpu(&gdp->bg_used_dirs_count, -1); 266 if (is_directory) {
267 count = ext4_used_dirs_count(sb, gdp) - 1;
268 ext4_used_dirs_set(sb, gdp, count);
269 }
242 gdp->bg_checksum = ext4_group_desc_csum(sbi, 270 gdp->bg_checksum = ext4_group_desc_csum(sbi,
243 block_group, gdp); 271 block_group, gdp);
244 spin_unlock(sb_bgl_lock(sbi, block_group)); 272 spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
253 spin_unlock(sb_bgl_lock(sbi, flex_group)); 281 spin_unlock(sb_bgl_lock(sbi, flex_group));
254 } 282 }
255 } 283 }
256 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
257 err = ext4_journal_dirty_metadata(handle, bh2); 285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
258 if (!fatal) fatal = err; 286 if (!fatal) fatal = err;
259 } 287 }
260 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); 288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
261 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
262 if (!fatal) 290 if (!fatal)
263 fatal = err; 291 fatal = err;
264 sb->s_dirt = 1; 292 sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
291 319
292 for (group = 0; group < ngroups; group++) { 320 for (group = 0; group < ngroups; group++) {
293 desc = ext4_get_group_desc(sb, group, NULL); 321 desc = ext4_get_group_desc(sb, group, NULL);
294 if (!desc || !desc->bg_free_inodes_count) 322 if (!desc || !ext4_free_inodes_count(sb, desc))
295 continue; 323 continue;
296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 324 if (ext4_free_inodes_count(sb, desc) < avefreei)
297 continue; 325 continue;
298 if (!best_desc || 326 if (!best_desc ||
299 (le16_to_cpu(desc->bg_free_blocks_count) > 327 (ext4_free_blks_count(sb, desc) >
300 le16_to_cpu(best_desc->bg_free_blocks_count))) { 328 ext4_free_blks_count(sb, best_desc))) {
301 *best_group = group; 329 *best_group = group;
302 best_desc = desc; 330 best_desc = desc;
303 ret = 0; 331 ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
369 for (i = best_flex * flex_size; i < ngroups && 397 for (i = best_flex * flex_size; i < ngroups &&
370 i < (best_flex + 1) * flex_size; i++) { 398 i < (best_flex + 1) * flex_size; i++) {
371 desc = ext4_get_group_desc(sb, i, &bh); 399 desc = ext4_get_group_desc(sb, i, &bh);
372 if (le16_to_cpu(desc->bg_free_inodes_count)) { 400 if (ext4_free_inodes_count(sb, desc)) {
373 *best_group = i; 401 *best_group = i;
374 goto out; 402 goto out;
375 } 403 }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
443 for (i = 0; i < ngroups; i++) { 471 for (i = 0; i < ngroups; i++) {
444 grp = (parent_group + i) % ngroups; 472 grp = (parent_group + i) % ngroups;
445 desc = ext4_get_group_desc(sb, grp, NULL); 473 desc = ext4_get_group_desc(sb, grp, NULL);
446 if (!desc || !desc->bg_free_inodes_count) 474 if (!desc || !ext4_free_inodes_count(sb, desc))
447 continue; 475 continue;
448 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 476 if (ext4_used_dirs_count(sb, desc) >= best_ndir)
449 continue; 477 continue;
450 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 478 if (ext4_free_inodes_count(sb, desc) < avefreei)
451 continue; 479 continue;
452 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 480 if (ext4_free_blks_count(sb, desc) < avefreeb)
453 continue; 481 continue;
454 *group = grp; 482 *group = grp;
455 ret = 0; 483 ret = 0;
456 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 484 best_ndir = ext4_used_dirs_count(sb, desc);
457 } 485 }
458 if (ret == 0) 486 if (ret == 0)
459 return ret; 487 return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
479 for (i = 0; i < ngroups; i++) { 507 for (i = 0; i < ngroups; i++) {
480 *group = (parent_group + i) % ngroups; 508 *group = (parent_group + i) % ngroups;
481 desc = ext4_get_group_desc(sb, *group, NULL); 509 desc = ext4_get_group_desc(sb, *group, NULL);
482 if (!desc || !desc->bg_free_inodes_count) 510 if (!desc || !ext4_free_inodes_count(sb, desc))
483 continue; 511 continue;
484 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 512 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
485 continue; 513 continue;
486 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) 514 if (ext4_free_inodes_count(sb, desc) < min_inodes)
487 continue; 515 continue;
488 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 516 if (ext4_free_blks_count(sb, desc) < min_blocks)
489 continue; 517 continue;
490 return 0; 518 return 0;
491 } 519 }
@@ -494,8 +522,8 @@ fallback:
494 for (i = 0; i < ngroups; i++) { 522 for (i = 0; i < ngroups; i++) {
495 *group = (parent_group + i) % ngroups; 523 *group = (parent_group + i) % ngroups;
496 desc = ext4_get_group_desc(sb, *group, NULL); 524 desc = ext4_get_group_desc(sb, *group, NULL);
497 if (desc && desc->bg_free_inodes_count && 525 if (desc && ext4_free_inodes_count(sb, desc) &&
498 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 526 ext4_free_inodes_count(sb, desc) >= avefreei)
499 return 0; 527 return 0;
500 } 528 }
501 529
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
524 */ 552 */
525 *group = parent_group; 553 *group = parent_group;
526 desc = ext4_get_group_desc(sb, *group, NULL); 554 desc = ext4_get_group_desc(sb, *group, NULL);
527 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 555 if (desc && ext4_free_inodes_count(sb, desc) &&
528 le16_to_cpu(desc->bg_free_blocks_count)) 556 ext4_free_blks_count(sb, desc))
529 return 0; 557 return 0;
530 558
531 /* 559 /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
548 if (*group >= ngroups) 576 if (*group >= ngroups)
549 *group -= ngroups; 577 *group -= ngroups;
550 desc = ext4_get_group_desc(sb, *group, NULL); 578 desc = ext4_get_group_desc(sb, *group, NULL);
551 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 579 if (desc && ext4_free_inodes_count(sb, desc) &&
552 le16_to_cpu(desc->bg_free_blocks_count)) 580 ext4_free_blks_count(sb, desc))
553 return 0; 581 return 0;
554 } 582 }
555 583
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
562 if (++*group >= ngroups) 590 if (++*group >= ngroups)
563 *group = 0; 591 *group = 0;
564 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, *group, NULL);
565 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 593 if (desc && ext4_free_inodes_count(sb, desc))
566 return 0; 594 return 0;
567 } 595 }
568 596
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
570} 598}
571 599
572/* 600/*
601 * claim the inode from the inode bitmap. If the group
602 * is uninit we need to take the groups's sb_bgl_lock
603 * and clear the uninit flag. The inode bitmap update
604 * and group desc uninit flag clear should be done
605 * after holding sb_bgl_lock so that ext4_read_inode_bitmap
606 * doesn't race with the ext4_claim_inode
607 */
608static int ext4_claim_inode(struct super_block *sb,
609 struct buffer_head *inode_bitmap_bh,
610 unsigned long ino, ext4_group_t group, int mode)
611{
612 int free = 0, retval = 0, count;
613 struct ext4_sb_info *sbi = EXT4_SB(sb);
614 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
615
616 spin_lock(sb_bgl_lock(sbi, group));
617 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
618 /* not a free inode */
619 retval = 1;
620 goto err_ret;
621 }
622 ino++;
623 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
624 ino > EXT4_INODES_PER_GROUP(sb)) {
625 spin_unlock(sb_bgl_lock(sbi, group));
626 ext4_error(sb, __func__,
627 "reserved inode or inode > inodes count - "
628 "block_group = %u, inode=%lu", group,
629 ino + group * EXT4_INODES_PER_GROUP(sb));
630 return 1;
631 }
632 /* If we didn't allocate from within the initialized part of the inode
633 * table then we need to initialize up to this inode. */
634 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
635
636 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
637 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
638 /* When marking the block group with
639 * ~EXT4_BG_INODE_UNINIT we don't want to depend
640 * on the value of bg_itable_unused even though
641 * mke2fs could have initialized the same for us.
642 * Instead we calculated the value below
643 */
644
645 free = 0;
646 } else {
647 free = EXT4_INODES_PER_GROUP(sb) -
648 ext4_itable_unused_count(sb, gdp);
649 }
650
651 /*
652 * Check the relative inode number against the last used
653 * relative inode number in this group. if it is greater
654 * we need to update the bg_itable_unused count
655 *
656 */
657 if (ino > free)
658 ext4_itable_unused_set(sb, gdp,
659 (EXT4_INODES_PER_GROUP(sb) - ino));
660 }
661 count = ext4_free_inodes_count(sb, gdp) - 1;
662 ext4_free_inodes_set(sb, gdp, count);
663 if (S_ISDIR(mode)) {
664 count = ext4_used_dirs_count(sb, gdp) + 1;
665 ext4_used_dirs_set(sb, gdp, count);
666 }
667 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
668err_ret:
669 spin_unlock(sb_bgl_lock(sbi, group));
670 return retval;
671}
672
673/*
573 * There are two policies for allocating an inode. If the new inode is 674 * There are two policies for allocating an inode. If the new inode is
574 * a directory, then a forward search is made for a block group with both 675 * a directory, then a forward search is made for a block group with both
575 * free space and a low directory-to-inode ratio; if that fails, then of 676 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 683struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
583{ 684{
584 struct super_block *sb; 685 struct super_block *sb;
585 struct buffer_head *bitmap_bh = NULL; 686 struct buffer_head *inode_bitmap_bh = NULL;
586 struct buffer_head *bh2; 687 struct buffer_head *group_desc_bh;
587 ext4_group_t group = 0; 688 ext4_group_t group = 0;
588 unsigned long ino = 0; 689 unsigned long ino = 0;
589 struct inode *inode; 690 struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
602 return ERR_PTR(-EPERM); 703 return ERR_PTR(-EPERM);
603 704
604 sb = dir->i_sb; 705 sb = dir->i_sb;
706 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
707 dir->i_ino, mode);
605 inode = new_inode(sb); 708 inode = new_inode(sb);
606 if (!inode) 709 if (!inode)
607 return ERR_PTR(-ENOMEM); 710 return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
631 for (i = 0; i < sbi->s_groups_count; i++) { 734 for (i = 0; i < sbi->s_groups_count; i++) {
632 err = -EIO; 735 err = -EIO;
633 736
634 gdp = ext4_get_group_desc(sb, group, &bh2); 737 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
635 if (!gdp) 738 if (!gdp)
636 goto fail; 739 goto fail;
637 740
638 brelse(bitmap_bh); 741 brelse(inode_bitmap_bh);
639 bitmap_bh = ext4_read_inode_bitmap(sb, group); 742 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
640 if (!bitmap_bh) 743 if (!inode_bitmap_bh)
641 goto fail; 744 goto fail;
642 745
643 ino = 0; 746 ino = 0;
644 747
645repeat_in_this_group: 748repeat_in_this_group:
646 ino = ext4_find_next_zero_bit((unsigned long *) 749 ino = ext4_find_next_zero_bit((unsigned long *)
647 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); 750 inode_bitmap_bh->b_data,
751 EXT4_INODES_PER_GROUP(sb), ino);
752
648 if (ino < EXT4_INODES_PER_GROUP(sb)) { 753 if (ino < EXT4_INODES_PER_GROUP(sb)) {
649 754
650 BUFFER_TRACE(bitmap_bh, "get_write_access"); 755 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
651 err = ext4_journal_get_write_access(handle, bitmap_bh); 756 err = ext4_journal_get_write_access(handle,
757 inode_bitmap_bh);
652 if (err) 758 if (err)
653 goto fail; 759 goto fail;
654 760
655 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), 761 BUFFER_TRACE(group_desc_bh, "get_write_access");
656 ino, bitmap_bh->b_data)) { 762 err = ext4_journal_get_write_access(handle,
763 group_desc_bh);
764 if (err)
765 goto fail;
766 if (!ext4_claim_inode(sb, inode_bitmap_bh,
767 ino, group, mode)) {
657 /* we won it */ 768 /* we won it */
658 BUFFER_TRACE(bitmap_bh, 769 BUFFER_TRACE(inode_bitmap_bh,
659 "call ext4_journal_dirty_metadata"); 770 "call ext4_handle_dirty_metadata");
660 err = ext4_journal_dirty_metadata(handle, 771 err = ext4_handle_dirty_metadata(handle,
661 bitmap_bh); 772 inode,
773 inode_bitmap_bh);
662 if (err) 774 if (err)
663 goto fail; 775 goto fail;
776 /* zero bit is inode number 1*/
777 ino++;
664 goto got; 778 goto got;
665 } 779 }
666 /* we lost it */ 780 /* we lost it */
667 jbd2_journal_release_buffer(handle, bitmap_bh); 781 ext4_handle_release_buffer(handle, inode_bitmap_bh);
782 ext4_handle_release_buffer(handle, group_desc_bh);
668 783
669 if (++ino < EXT4_INODES_PER_GROUP(sb)) 784 if (++ino < EXT4_INODES_PER_GROUP(sb))
670 goto repeat_in_this_group; 785 goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
684 goto out; 799 goto out;
685 800
686got: 801got:
687 ino++;
688 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
689 ino > EXT4_INODES_PER_GROUP(sb)) {
690 ext4_error(sb, __func__,
691 "reserved inode or inode > inodes count - "
692 "block_group = %lu, inode=%lu", group,
693 ino + group * EXT4_INODES_PER_GROUP(sb));
694 err = -EIO;
695 goto fail;
696 }
697
698 BUFFER_TRACE(bh2, "get_write_access");
699 err = ext4_journal_get_write_access(handle, bh2);
700 if (err) goto fail;
701
702 /* We may have to initialize the block bitmap if it isn't already */ 802 /* We may have to initialize the block bitmap if it isn't already */
703 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 803 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
704 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 804 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
705 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); 805 struct buffer_head *block_bitmap_bh;
706 806
707 BUFFER_TRACE(block_bh, "get block bitmap access"); 807 block_bitmap_bh = ext4_read_block_bitmap(sb, group);
708 err = ext4_journal_get_write_access(handle, block_bh); 808 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
809 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
709 if (err) { 810 if (err) {
710 brelse(block_bh); 811 brelse(block_bitmap_bh);
711 goto fail; 812 goto fail;
712 } 813 }
713 814
@@ -715,9 +816,9 @@ got:
715 spin_lock(sb_bgl_lock(sbi, group)); 816 spin_lock(sb_bgl_lock(sbi, group));
716 /* recheck and clear flag under lock if we still need to */ 817 /* recheck and clear flag under lock if we still need to */
717 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 818 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
718 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
719 free = ext4_free_blocks_after_init(sb, group, gdp); 819 free = ext4_free_blocks_after_init(sb, group, gdp);
720 gdp->bg_free_blocks_count = cpu_to_le16(free); 820 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
821 ext4_free_blks_set(sb, gdp, free);
721 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 822 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
722 gdp); 823 gdp);
723 } 824 }
@@ -725,55 +826,19 @@ got:
725 826
726 /* Don't need to dirty bitmap block if we didn't change it */ 827 /* Don't need to dirty bitmap block if we didn't change it */
727 if (free) { 828 if (free) {
728 BUFFER_TRACE(block_bh, "dirty block bitmap"); 829 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
729 err = ext4_journal_dirty_metadata(handle, block_bh); 830 err = ext4_handle_dirty_metadata(handle,
831 NULL, block_bitmap_bh);
730 } 832 }
731 833
732 brelse(block_bh); 834 brelse(block_bitmap_bh);
733 if (err) 835 if (err)
734 goto fail; 836 goto fail;
735 } 837 }
736 838 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
737 spin_lock(sb_bgl_lock(sbi, group)); 839 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
738 /* If we didn't allocate from within the initialized part of the inode 840 if (err)
739 * table then we need to initialize up to this inode. */ 841 goto fail;
740 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
741 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
742 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
743
744 /* When marking the block group with
745 * ~EXT4_BG_INODE_UNINIT we don't want to depend
746 * on the value of bg_itable_unused even though
747 * mke2fs could have initialized the same for us.
748 * Instead we calculated the value below
749 */
750
751 free = 0;
752 } else {
753 free = EXT4_INODES_PER_GROUP(sb) -
754 le16_to_cpu(gdp->bg_itable_unused);
755 }
756
757 /*
758 * Check the relative inode number against the last used
759 * relative inode number in this group. if it is greater
760 * we need to update the bg_itable_unused count
761 *
762 */
763 if (ino > free)
764 gdp->bg_itable_unused =
765 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
766 }
767
768 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
769 if (S_ISDIR(mode)) {
770 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
771 }
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773 spin_unlock(sb_bgl_lock(sbi, group));
774 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
775 err = ext4_journal_dirty_metadata(handle, bh2);
776 if (err) goto fail;
777 842
778 percpu_counter_dec(&sbi->s_freeinodes_counter); 843 percpu_counter_dec(&sbi->s_freeinodes_counter);
779 if (S_ISDIR(mode)) 844 if (S_ISDIR(mode))
@@ -825,7 +890,7 @@ got:
825 890
826 ext4_set_inode_flags(inode); 891 ext4_set_inode_flags(inode);
827 if (IS_DIRSYNC(inode)) 892 if (IS_DIRSYNC(inode))
828 handle->h_sync = 1; 893 ext4_handle_sync(handle);
829 if (insert_inode_locked(inode) < 0) { 894 if (insert_inode_locked(inode) < 0) {
830 err = -EINVAL; 895 err = -EINVAL;
831 goto fail_drop; 896 goto fail_drop;
@@ -852,7 +917,7 @@ got:
852 if (err) 917 if (err)
853 goto fail_free_drop; 918 goto fail_free_drop;
854 919
855 if (test_opt(sb, EXTENTS)) { 920 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
856 /* set extent flag only for directory, file and normal symlink*/ 921 /* set extent flag only for directory, file and normal symlink*/
857 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 922 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
858 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 923 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -867,6 +932,8 @@ got:
867 } 932 }
868 933
869 ext4_debug("allocating inode %lu\n", inode->i_ino); 934 ext4_debug("allocating inode %lu\n", inode->i_ino);
935 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
936 sb->s_id, inode->i_ino, dir->i_ino, mode);
870 goto really_out; 937 goto really_out;
871fail: 938fail:
872 ext4_std_error(sb, err); 939 ext4_std_error(sb, err);
@@ -874,7 +941,7 @@ out:
874 iput(inode); 941 iput(inode);
875 ret = ERR_PTR(err); 942 ret = ERR_PTR(err);
876really_out: 943really_out:
877 brelse(bitmap_bh); 944 brelse(inode_bitmap_bh);
878 return ret; 945 return ret;
879 946
880fail_free_drop: 947fail_free_drop:
@@ -886,7 +953,7 @@ fail_drop:
886 inode->i_nlink = 0; 953 inode->i_nlink = 0;
887 unlock_new_inode(inode); 954 unlock_new_inode(inode);
888 iput(inode); 955 iput(inode);
889 brelse(bitmap_bh); 956 brelse(inode_bitmap_bh);
890 return ERR_PTR(err); 957 return ERR_PTR(err);
891} 958}
892 959
@@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
985 gdp = ext4_get_group_desc(sb, i, NULL); 1052 gdp = ext4_get_group_desc(sb, i, NULL);
986 if (!gdp) 1053 if (!gdp)
987 continue; 1054 continue;
988 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1055 desc_count += ext4_free_inodes_count(sb, gdp);
989 brelse(bitmap_bh); 1056 brelse(bitmap_bh);
990 bitmap_bh = ext4_read_inode_bitmap(sb, i); 1057 bitmap_bh = ext4_read_inode_bitmap(sb, i);
991 if (!bitmap_bh) 1058 if (!bitmap_bh)
@@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
993 1060
994 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1061 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
995 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1062 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
996 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 1063 i, ext4_free_inodes_count(sb, gdp), x);
997 bitmap_count += x; 1064 bitmap_count += x;
998 } 1065 }
999 brelse(bitmap_bh); 1066 brelse(bitmap_bh);
@@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1007 gdp = ext4_get_group_desc(sb, i, NULL); 1074 gdp = ext4_get_group_desc(sb, i, NULL);
1008 if (!gdp) 1075 if (!gdp)
1009 continue; 1076 continue;
1010 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1077 desc_count += ext4_free_inodes_count(sb, gdp);
1011 cond_resched(); 1078 cond_resched();
1012 } 1079 }
1013 return desc_count; 1080 return desc_count;
@@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1024 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1091 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1025 if (!gdp) 1092 if (!gdp)
1026 continue; 1093 continue;
1027 count += le16_to_cpu(gdp->bg_used_dirs_count); 1094 count += ext4_used_dirs_count(sb, gdp);
1028 } 1095 }
1029 return count; 1096 return count;
1030} 1097}
1031
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 98d3fe7057ef..a6444cee0c7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
72 * "bh" may be NULL: a metadata block may have been freed from memory 72 * "bh" may be NULL: a metadata block may have been freed from memory
73 * but there may still be a record of it in the journal, and that record 73 * but there may still be a record of it in the journal, and that record
74 * still needs to be revoked. 74 * still needs to be revoked.
75 *
76 * If the handle isn't valid we're not journaling so there's nothing to do.
75 */ 77 */
76int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 78int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
77 struct buffer_head *bh, ext4_fsblk_t blocknr) 79 struct buffer_head *bh, ext4_fsblk_t blocknr)
78{ 80{
79 int err; 81 int err;
80 82
83 if (!ext4_handle_valid(handle))
84 return 0;
85
81 might_sleep(); 86 might_sleep();
82 87
83 BUFFER_TRACE(bh, "enter"); 88 BUFFER_TRACE(bh, "enter");
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
170 */ 175 */
171static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 176static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
172{ 177{
173 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 178 if (!ext4_handle_valid(handle))
179 return 0;
180 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
174 return 0; 181 return 0;
175 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 182 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
176 return 0; 183 return 0;
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
184 */ 191 */
185static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 192static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
186{ 193{
194 BUG_ON(EXT4_JOURNAL(inode) == NULL);
187 jbd_debug(2, "restarting handle %p\n", handle); 195 jbd_debug(2, "restarting handle %p\n", handle);
188 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 196 return ext4_journal_restart(handle, blocks_for_truncate(inode));
189} 197}
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
216 } 224 }
217 225
218 if (IS_SYNC(inode)) 226 if (IS_SYNC(inode))
219 handle->h_sync = 1; 227 ext4_handle_sync(handle);
220 inode->i_size = 0; 228 inode->i_size = 0;
221 err = ext4_mark_inode_dirty(handle, inode); 229 err = ext4_mark_inode_dirty(handle, inode);
222 if (err) { 230 if (err) {
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
233 * enough credits left in the handle to remove the inode from 241 * enough credits left in the handle to remove the inode from
234 * the orphan list and set the dtime field. 242 * the orphan list and set the dtime field.
235 */ 243 */
236 if (handle->h_buffer_credits < 3) { 244 if (!ext4_handle_has_enough_credits(handle, 3)) {
237 err = ext4_journal_extend(handle, 3); 245 err = ext4_journal_extend(handle, 3);
238 if (err > 0) 246 if (err > 0)
239 err = ext4_journal_restart(handle, 3); 247 err = ext4_journal_restart(handle, 3);
@@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
506 * return the total number of blocks to be allocate, including the 514 * return the total number of blocks to be allocate, including the
507 * direct and indirect blocks. 515 * direct and indirect blocks.
508 */ 516 */
509static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 517static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
510 int blocks_to_boundary) 518 int blocks_to_boundary)
511{ 519{
512 unsigned long count = 0; 520 unsigned int count = 0;
513 521
514 /* 522 /*
515 * Simple case, [t,d]Indirect block(s) has not allocated yet 523 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
547 int indirect_blks, int blks, 555 int indirect_blks, int blks,
548 ext4_fsblk_t new_blocks[4], int *err) 556 ext4_fsblk_t new_blocks[4], int *err)
549{ 557{
558 struct ext4_allocation_request ar;
550 int target, i; 559 int target, i;
551 unsigned long count = 0, blk_allocated = 0; 560 unsigned long count = 0, blk_allocated = 0;
552 int index = 0; 561 int index = 0;
@@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 if (!target) 604 if (!target)
596 goto allocated; 605 goto allocated;
597 /* Now allocate data blocks */ 606 /* Now allocate data blocks */
598 count = target; 607 memset(&ar, 0, sizeof(ar));
599 /* allocating blocks for data blocks */ 608 ar.inode = inode;
600 current_block = ext4_new_blocks(handle, inode, iblock, 609 ar.goal = goal;
601 goal, &count, err); 610 ar.len = target;
611 ar.logical = iblock;
612 if (S_ISREG(inode->i_mode))
613 /* enable in-core preallocation only for regular files */
614 ar.flags = EXT4_MB_HINT_DATA;
615
616 current_block = ext4_mb_new_blocks(handle, &ar, err);
617
602 if (*err && (target == blks)) { 618 if (*err && (target == blks)) {
603 /* 619 /*
604 * if the allocation failed and we didn't allocate 620 * if the allocation failed and we didn't allocate
@@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
614 */ 630 */
615 new_blocks[index] = current_block; 631 new_blocks[index] = current_block;
616 } 632 }
617 blk_allocated += count; 633 blk_allocated += ar.len;
618 } 634 }
619allocated: 635allocated:
620 /* total number of blocks allocated for direct blocks */ 636 /* total number of blocks allocated for direct blocks */
@@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
709 set_buffer_uptodate(bh); 725 set_buffer_uptodate(bh);
710 unlock_buffer(bh); 726 unlock_buffer(bh);
711 727
712 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 728 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
713 err = ext4_journal_dirty_metadata(handle, bh); 729 err = ext4_handle_dirty_metadata(handle, inode, bh);
714 if (err) 730 if (err)
715 goto failed; 731 goto failed;
716 } 732 }
@@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
792 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 808 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
793 */ 809 */
794 jbd_debug(5, "splicing indirect only\n"); 810 jbd_debug(5, "splicing indirect only\n");
795 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); 811 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
796 err = ext4_journal_dirty_metadata(handle, where->bh); 812 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
797 if (err) 813 if (err)
798 goto err_out; 814 goto err_out;
799 } else { 815 } else {
@@ -840,10 +856,10 @@ err_out:
840 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 856 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
841 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 857 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
842 */ 858 */
843int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 859static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
844 ext4_lblk_t iblock, unsigned long maxblocks, 860 ext4_lblk_t iblock, unsigned int maxblocks,
845 struct buffer_head *bh_result, 861 struct buffer_head *bh_result,
846 int create, int extend_disksize) 862 int create, int extend_disksize)
847{ 863{
848 int err = -EIO; 864 int err = -EIO;
849 ext4_lblk_t offsets[4]; 865 ext4_lblk_t offsets[4];
@@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1045 * It returns the error in case of allocation failure. 1061 * It returns the error in case of allocation failure.
1046 */ 1062 */
1047int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1063int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1048 unsigned long max_blocks, struct buffer_head *bh, 1064 unsigned int max_blocks, struct buffer_head *bh,
1049 int create, int extend_disksize, int flag) 1065 int create, int extend_disksize, int flag)
1050{ 1066{
1051 int retval; 1067 int retval;
@@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1221 set_buffer_uptodate(bh); 1237 set_buffer_uptodate(bh);
1222 } 1238 }
1223 unlock_buffer(bh); 1239 unlock_buffer(bh);
1224 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1240 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1225 err = ext4_journal_dirty_metadata(handle, bh); 1241 err = ext4_handle_dirty_metadata(handle, inode, bh);
1226 if (!fatal) 1242 if (!fatal)
1227 fatal = err; 1243 fatal = err;
1228 } else { 1244 } else {
@@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1335 pgoff_t index; 1351 pgoff_t index;
1336 unsigned from, to; 1352 unsigned from, to;
1337 1353
1354 trace_mark(ext4_write_begin,
1355 "dev %s ino %lu pos %llu len %u flags %u",
1356 inode->i_sb->s_id, inode->i_ino,
1357 (unsigned long long) pos, len, flags);
1338 index = pos >> PAGE_CACHE_SHIFT; 1358 index = pos >> PAGE_CACHE_SHIFT;
1339 from = pos & (PAGE_CACHE_SIZE - 1); 1359 from = pos & (PAGE_CACHE_SIZE - 1);
1340 to = from + len; 1360 to = from + len;
@@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1387 if (!buffer_mapped(bh) || buffer_freed(bh)) 1407 if (!buffer_mapped(bh) || buffer_freed(bh))
1388 return 0; 1408 return 0;
1389 set_buffer_uptodate(bh); 1409 set_buffer_uptodate(bh);
1390 return ext4_journal_dirty_metadata(handle, bh); 1410 return ext4_handle_dirty_metadata(handle, NULL, bh);
1391} 1411}
1392 1412
1393/* 1413/*
@@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
1406 struct inode *inode = mapping->host; 1426 struct inode *inode = mapping->host;
1407 int ret = 0, ret2; 1427 int ret = 0, ret2;
1408 1428
1429 trace_mark(ext4_ordered_write_end,
1430 "dev %s ino %lu pos %llu len %u copied %u",
1431 inode->i_sb->s_id, inode->i_ino,
1432 (unsigned long long) pos, len, copied);
1409 ret = ext4_jbd2_file_inode(handle, inode); 1433 ret = ext4_jbd2_file_inode(handle, inode);
1410 1434
1411 if (ret == 0) { 1435 if (ret == 0) {
@@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
1444 int ret = 0, ret2; 1468 int ret = 0, ret2;
1445 loff_t new_i_size; 1469 loff_t new_i_size;
1446 1470
1471 trace_mark(ext4_writeback_write_end,
1472 "dev %s ino %lu pos %llu len %u copied %u",
1473 inode->i_sb->s_id, inode->i_ino,
1474 (unsigned long long) pos, len, copied);
1447 new_i_size = pos + copied; 1475 new_i_size = pos + copied;
1448 if (new_i_size > EXT4_I(inode)->i_disksize) { 1476 if (new_i_size > EXT4_I(inode)->i_disksize) {
1449 ext4_update_i_disksize(inode, new_i_size); 1477 ext4_update_i_disksize(inode, new_i_size);
@@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
1479 unsigned from, to; 1507 unsigned from, to;
1480 loff_t new_i_size; 1508 loff_t new_i_size;
1481 1509
1510 trace_mark(ext4_journalled_write_end,
1511 "dev %s ino %lu pos %llu len %u copied %u",
1512 inode->i_sb->s_id, inode->i_ino,
1513 (unsigned long long) pos, len, copied);
1482 from = pos & (PAGE_CACHE_SIZE - 1); 1514 from = pos & (PAGE_CACHE_SIZE - 1);
1483 to = from + len; 1515 to = from + len;
1484 1516
@@ -1625,7 +1657,7 @@ struct mpage_da_data {
1625 get_block_t *get_block; 1657 get_block_t *get_block;
1626 struct writeback_control *wbc; 1658 struct writeback_control *wbc;
1627 int io_done; 1659 int io_done;
1628 long pages_written; 1660 int pages_written;
1629 int retval; 1661 int retval;
1630}; 1662};
1631 1663
@@ -1645,35 +1677,39 @@ struct mpage_da_data {
1645 */ 1677 */
1646static int mpage_da_submit_io(struct mpage_da_data *mpd) 1678static int mpage_da_submit_io(struct mpage_da_data *mpd)
1647{ 1679{
1648 struct address_space *mapping = mpd->inode->i_mapping;
1649 int ret = 0, err, nr_pages, i;
1650 unsigned long index, end;
1651 struct pagevec pvec;
1652 long pages_skipped; 1680 long pages_skipped;
1681 struct pagevec pvec;
1682 unsigned long index, end;
1683 int ret = 0, err, nr_pages, i;
1684 struct inode *inode = mpd->inode;
1685 struct address_space *mapping = inode->i_mapping;
1653 1686
1654 BUG_ON(mpd->next_page <= mpd->first_page); 1687 BUG_ON(mpd->next_page <= mpd->first_page);
1655 pagevec_init(&pvec, 0); 1688 /*
1689 * We need to start from the first_page to the next_page - 1
1690 * to make sure we also write the mapped dirty buffer_heads.
1691 * If we look at mpd->lbh.b_blocknr we would only be looking
1692 * at the currently mapped buffer_heads.
1693 */
1656 index = mpd->first_page; 1694 index = mpd->first_page;
1657 end = mpd->next_page - 1; 1695 end = mpd->next_page - 1;
1658 1696
1697 pagevec_init(&pvec, 0);
1659 while (index <= end) { 1698 while (index <= end) {
1660 /* 1699 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1661 * We can use PAGECACHE_TAG_DIRTY lookup here because
1662 * even though we have cleared the dirty flag on the page
1663 * We still keep the page in the radix tree with tag
1664 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
1665 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
1666 * which is called via the below writepage callback.
1667 */
1668 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1669 PAGECACHE_TAG_DIRTY,
1670 min(end - index,
1671 (pgoff_t)PAGEVEC_SIZE-1) + 1);
1672 if (nr_pages == 0) 1700 if (nr_pages == 0)
1673 break; 1701 break;
1674 for (i = 0; i < nr_pages; i++) { 1702 for (i = 0; i < nr_pages; i++) {
1675 struct page *page = pvec.pages[i]; 1703 struct page *page = pvec.pages[i];
1676 1704
1705 index = page->index;
1706 if (index > end)
1707 break;
1708 index++;
1709
1710 BUG_ON(!PageLocked(page));
1711 BUG_ON(PageWriteback(page));
1712
1677 pages_skipped = mpd->wbc->pages_skipped; 1713 pages_skipped = mpd->wbc->pages_skipped;
1678 err = mapping->a_ops->writepage(page, mpd->wbc); 1714 err = mapping->a_ops->writepage(page, mpd->wbc);
1679 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1715 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
1831 ext4_count_free_blocks(inode->i_sb)); 1867 ext4_count_free_blocks(inode->i_sb));
1832 printk(KERN_EMERG "Free/Dirty block details\n"); 1868 printk(KERN_EMERG "Free/Dirty block details\n");
1833 printk(KERN_EMERG "free_blocks=%lld\n", 1869 printk(KERN_EMERG "free_blocks=%lld\n",
1834 percpu_counter_sum(&sbi->s_freeblocks_counter)); 1870 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
1835 printk(KERN_EMERG "dirty_blocks=%lld\n", 1871 printk(KERN_EMERG "dirty_blocks=%lld\n",
1836 percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1872 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1837 printk(KERN_EMERG "Block reservation details\n"); 1873 printk(KERN_EMERG "Block reservation details\n");
1838 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", 1874 printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
1839 EXT4_I(inode)->i_reserved_data_blocks); 1875 EXT4_I(inode)->i_reserved_data_blocks);
1840 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", 1876 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
1841 EXT4_I(inode)->i_reserved_meta_blocks); 1877 EXT4_I(inode)->i_reserved_meta_blocks);
1842 return; 1878 return;
1843} 1879}
@@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
2087 bh = head; 2123 bh = head;
2088 do { 2124 do {
2089 BUG_ON(buffer_locked(bh)); 2125 BUG_ON(buffer_locked(bh));
2126 /*
2127 * We need to try to allocate
2128 * unmapped blocks in the same page.
2129 * Otherwise we won't make progress
2130 * with the page in ext4_da_writepage
2131 */
2090 if (buffer_dirty(bh) && 2132 if (buffer_dirty(bh) &&
2091 (!buffer_mapped(bh) || buffer_delay(bh))) { 2133 (!buffer_mapped(bh) || buffer_delay(bh))) {
2092 mpage_add_bh_to_extent(mpd, logical, bh); 2134 mpage_add_bh_to_extent(mpd, logical, bh);
2093 if (mpd->io_done) 2135 if (mpd->io_done)
2094 return MPAGE_DA_EXTENT_TAIL; 2136 return MPAGE_DA_EXTENT_TAIL;
2137 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2138 /*
2139 * mapped dirty buffer. We need to update
2140 * the b_state because we look at
2141 * b_state in mpage_da_map_blocks. We don't
2142 * update b_size because if we find an
2143 * unmapped buffer_head later we need to
2144 * use the b_state flag of that buffer_head.
2145 */
2146 if (mpd->lbh.b_size == 0)
2147 mpd->lbh.b_state =
2148 bh->b_state & BH_FLAGS;
2095 } 2149 }
2096 logical++; 2150 logical++;
2097 } while ((bh = bh->b_this_page) != head); 2151 } while ((bh = bh->b_this_page) != head);
@@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
2269{ 2323{
2270 int ret = 0; 2324 int ret = 0;
2271 loff_t size; 2325 loff_t size;
2272 unsigned long len; 2326 unsigned int len;
2273 struct buffer_head *page_bufs; 2327 struct buffer_head *page_bufs;
2274 struct inode *inode = page->mapping->host; 2328 struct inode *inode = page->mapping->host;
2275 2329
2330 trace_mark(ext4_da_writepage,
2331 "dev %s ino %lu page_index %lu",
2332 inode->i_sb->s_id, inode->i_ino, page->index);
2276 size = i_size_read(inode); 2333 size = i_size_read(inode);
2277 if (page->index == size >> PAGE_CACHE_SHIFT) 2334 if (page->index == size >> PAGE_CACHE_SHIFT)
2278 len = size & ~PAGE_CACHE_MASK; 2335 len = size & ~PAGE_CACHE_MASK;
@@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
2378 struct mpage_da_data mpd; 2435 struct mpage_da_data mpd;
2379 struct inode *inode = mapping->host; 2436 struct inode *inode = mapping->host;
2380 int no_nrwrite_index_update; 2437 int no_nrwrite_index_update;
2381 long pages_written = 0, pages_skipped; 2438 int pages_written = 0;
2439 long pages_skipped;
2382 int needed_blocks, ret = 0, nr_to_writebump = 0; 2440 int needed_blocks, ret = 0, nr_to_writebump = 0;
2383 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2384 2442
2443 trace_mark(ext4_da_writepages,
2444 "dev %s ino %lu nr_t_write %ld "
2445 "pages_skipped %ld range_start %llu "
2446 "range_end %llu nonblocking %d "
2447 "for_kupdate %d for_reclaim %d "
2448 "for_writepages %d range_cyclic %d",
2449 inode->i_sb->s_id, inode->i_ino,
2450 wbc->nr_to_write, wbc->pages_skipped,
2451 (unsigned long long) wbc->range_start,
2452 (unsigned long long) wbc->range_end,
2453 wbc->nonblocking, wbc->for_kupdate,
2454 wbc->for_reclaim, wbc->for_writepages,
2455 wbc->range_cyclic);
2456
2385 /* 2457 /*
2386 * No pages to write? This is mainly a kludge to avoid starting 2458 * No pages to write? This is mainly a kludge to avoid starting
2387 * a transaction for special inodes like journal inode on last iput() 2459 * a transaction for special inodes like journal inode on last iput()
@@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
2389 */ 2461 */
2390 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2462 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2391 return 0; 2463 return 0;
2464
2465 /*
2466 * If the filesystem has aborted, it is read-only, so return
2467 * right away instead of dumping stack traces later on that
2468 * will obscure the real source of the problem. We test
2469 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
2470 * the latter could be true if the filesystem is mounted
2471 * read-only, and in that case, ext4_da_writepages should
2472 * *never* be called, so if that ever happens, we would want
2473 * the stack trace.
2474 */
2475 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
2476 return -EROFS;
2477
2392 /* 2478 /*
2393 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2479 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2394 * This make sure small files blocks are allocated in 2480 * This make sure small files blocks are allocated in
@@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2433 handle = ext4_journal_start(inode, needed_blocks); 2519 handle = ext4_journal_start(inode, needed_blocks);
2434 if (IS_ERR(handle)) { 2520 if (IS_ERR(handle)) {
2435 ret = PTR_ERR(handle); 2521 ret = PTR_ERR(handle);
2436 printk(KERN_EMERG "%s: jbd2_start: " 2522 printk(KERN_CRIT "%s: jbd2_start: "
2437 "%ld pages, ino %lu; err %d\n", __func__, 2523 "%ld pages, ino %lu; err %d\n", __func__,
2438 wbc->nr_to_write, inode->i_ino, ret); 2524 wbc->nr_to_write, inode->i_ino, ret);
2439 dump_stack(); 2525 dump_stack();
@@ -2486,6 +2572,14 @@ out_writepages:
2486 if (!no_nrwrite_index_update) 2572 if (!no_nrwrite_index_update)
2487 wbc->no_nrwrite_index_update = 0; 2573 wbc->no_nrwrite_index_update = 0;
2488 wbc->nr_to_write -= nr_to_writebump; 2574 wbc->nr_to_write -= nr_to_writebump;
2575 trace_mark(ext4_da_writepage_result,
2576 "dev %s ino %lu ret %d pages_written %d "
2577 "pages_skipped %ld congestion %d "
2578 "more_io %d no_nrwrite_index_update %d",
2579 inode->i_sb->s_id, inode->i_ino, ret,
2580 pages_written, wbc->pages_skipped,
2581 wbc->encountered_congestion, wbc->more_io,
2582 wbc->no_nrwrite_index_update);
2489 return ret; 2583 return ret;
2490} 2584}
2491 2585
@@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2537 len, flags, pagep, fsdata); 2631 len, flags, pagep, fsdata);
2538 } 2632 }
2539 *fsdata = (void *)0; 2633 *fsdata = (void *)0;
2634
2635 trace_mark(ext4_da_write_begin,
2636 "dev %s ino %lu pos %llu len %u flags %u",
2637 inode->i_sb->s_id, inode->i_ino,
2638 (unsigned long long) pos, len, flags);
2540retry: 2639retry:
2541 /* 2640 /*
2542 * With delayed allocation, we don't log the i_disksize update 2641 * With delayed allocation, we don't log the i_disksize update
@@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
2626 } 2725 }
2627 } 2726 }
2628 2727
2728 trace_mark(ext4_da_write_end,
2729 "dev %s ino %lu pos %llu len %u copied %u",
2730 inode->i_sb->s_id, inode->i_ino,
2731 (unsigned long long) pos, len, copied);
2629 start = pos & (PAGE_CACHE_SIZE - 1); 2732 start = pos & (PAGE_CACHE_SIZE - 1);
2630 end = start + copied - 1; 2733 end = start + copied - 1;
2631 2734
@@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2718 filemap_write_and_wait(mapping); 2821 filemap_write_and_wait(mapping);
2719 } 2822 }
2720 2823
2721 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2824 BUG_ON(!EXT4_JOURNAL(inode) &&
2825 EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
2826
2827 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2722 /* 2828 /*
2723 * This is a REALLY heavyweight approach, but the use of 2829 * This is a REALLY heavyweight approach, but the use of
2724 * bmap on dirty files is expected to be extremely rare: 2830 * bmap on dirty files is expected to be extremely rare:
@@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
2836 loff_t size = i_size_read(inode); 2942 loff_t size = i_size_read(inode);
2837 loff_t len; 2943 loff_t len;
2838 2944
2945 trace_mark(ext4_normal_writepage,
2946 "dev %s ino %lu page_index %lu",
2947 inode->i_sb->s_id, inode->i_ino, page->index);
2839 J_ASSERT(PageLocked(page)); 2948 J_ASSERT(PageLocked(page));
2840 if (page->index == size >> PAGE_CACHE_SHIFT) 2949 if (page->index == size >> PAGE_CACHE_SHIFT)
2841 len = size & ~PAGE_CACHE_MASK; 2950 len = size & ~PAGE_CACHE_MASK;
@@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
2921 loff_t size = i_size_read(inode); 3030 loff_t size = i_size_read(inode);
2922 loff_t len; 3031 loff_t len;
2923 3032
3033 trace_mark(ext4_journalled_writepage,
3034 "dev %s ino %lu page_index %lu",
3035 inode->i_sb->s_id, inode->i_ino, page->index);
2924 J_ASSERT(PageLocked(page)); 3036 J_ASSERT(PageLocked(page));
2925 if (page->index == size >> PAGE_CACHE_SHIFT) 3037 if (page->index == size >> PAGE_CACHE_SHIFT)
2926 len = size & ~PAGE_CACHE_MASK; 3038 len = size & ~PAGE_CACHE_MASK;
@@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
2989 if (offset == 0) 3101 if (offset == 0)
2990 ClearPageChecked(page); 3102 ClearPageChecked(page);
2991 3103
2992 jbd2_journal_invalidatepage(journal, page, offset); 3104 if (journal)
3105 jbd2_journal_invalidatepage(journal, page, offset);
3106 else
3107 block_invalidatepage(page, offset);
2993} 3108}
2994 3109
2995static int ext4_releasepage(struct page *page, gfp_t wait) 3110static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2999 WARN_ON(PageChecked(page)); 3114 WARN_ON(PageChecked(page));
3000 if (!page_has_buffers(page)) 3115 if (!page_has_buffers(page))
3001 return 0; 3116 return 0;
3002 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3117 if (journal)
3118 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3119 else
3120 return try_to_free_buffers(page);
3003} 3121}
3004 3122
3005/* 3123/*
@@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
3271 3389
3272 err = 0; 3390 err = 0;
3273 if (ext4_should_journal_data(inode)) { 3391 if (ext4_should_journal_data(inode)) {
3274 err = ext4_journal_dirty_metadata(handle, bh); 3392 err = ext4_handle_dirty_metadata(handle, inode, bh);
3275 } else { 3393 } else {
3276 if (ext4_should_order_data(inode)) 3394 if (ext4_should_order_data(inode))
3277 err = ext4_jbd2_file_inode(handle, inode); 3395 err = ext4_jbd2_file_inode(handle, inode);
@@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3395 __le32 *p; 3513 __le32 *p;
3396 if (try_to_extend_transaction(handle, inode)) { 3514 if (try_to_extend_transaction(handle, inode)) {
3397 if (bh) { 3515 if (bh) {
3398 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 3516 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3399 ext4_journal_dirty_metadata(handle, bh); 3517 ext4_handle_dirty_metadata(handle, inode, bh);
3400 } 3518 }
3401 ext4_mark_inode_dirty(handle, inode); 3519 ext4_mark_inode_dirty(handle, inode);
3402 ext4_journal_test_restart(handle, inode); 3520 ext4_journal_test_restart(handle, inode);
@@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3496 count, block_to_free_p, p); 3614 count, block_to_free_p, p);
3497 3615
3498 if (this_bh) { 3616 if (this_bh) {
3499 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3617 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
3500 3618
3501 /* 3619 /*
3502 * The buffer head should have an attached journal head at this 3620 * The buffer head should have an attached journal head at this
@@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3505 * the block was cleared. Check for this instead of OOPSing. 3623 * the block was cleared. Check for this instead of OOPSing.
3506 */ 3624 */
3507 if (bh2jh(this_bh)) 3625 if (bh2jh(this_bh))
3508 ext4_journal_dirty_metadata(handle, this_bh); 3626 ext4_handle_dirty_metadata(handle, inode, this_bh);
3509 else 3627 else
3510 ext4_error(inode->i_sb, __func__, 3628 ext4_error(inode->i_sb, __func__,
3511 "circular indirect block detected, " 3629 "circular indirect block detected, "
@@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3535 ext4_fsblk_t nr; 3653 ext4_fsblk_t nr;
3536 __le32 *p; 3654 __le32 *p;
3537 3655
3538 if (is_handle_aborted(handle)) 3656 if (ext4_handle_is_aborted(handle))
3539 return; 3657 return;
3540 3658
3541 if (depth--) { 3659 if (depth--) {
@@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3605 * will merely complain about releasing a free block, 3723 * will merely complain about releasing a free block,
3606 * rather than leaking blocks. 3724 * rather than leaking blocks.
3607 */ 3725 */
3608 if (is_handle_aborted(handle)) 3726 if (ext4_handle_is_aborted(handle))
3609 return; 3727 return;
3610 if (try_to_extend_transaction(handle, inode)) { 3728 if (try_to_extend_transaction(handle, inode)) {
3611 ext4_mark_inode_dirty(handle, inode); 3729 ext4_mark_inode_dirty(handle, inode);
@@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3624 parent_bh)){ 3742 parent_bh)){
3625 *p = 0; 3743 *p = 0;
3626 BUFFER_TRACE(parent_bh, 3744 BUFFER_TRACE(parent_bh,
3627 "call ext4_journal_dirty_metadata"); 3745 "call ext4_handle_dirty_metadata");
3628 ext4_journal_dirty_metadata(handle, 3746 ext4_handle_dirty_metadata(handle,
3629 parent_bh); 3747 inode,
3748 parent_bh);
3630 } 3749 }
3631 } 3750 }
3632 } 3751 }
@@ -3814,7 +3933,7 @@ do_indirects:
3814 * synchronous 3933 * synchronous
3815 */ 3934 */
3816 if (IS_SYNC(inode)) 3935 if (IS_SYNC(inode))
3817 handle->h_sync = 1; 3936 ext4_handle_sync(handle);
3818out_stop: 3937out_stop:
3819 /* 3938 /*
3820 * If this was a simple ftruncate(), and the file will remain alive 3939 * If this was a simple ftruncate(), and the file will remain alive
@@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
3844 ext4_fsblk_t block; 3963 ext4_fsblk_t block;
3845 int inodes_per_block, inode_offset; 3964 int inodes_per_block, inode_offset;
3846 3965
3847 iloc->bh = 0; 3966 iloc->bh = NULL;
3848 if (!ext4_valid_inum(sb, inode->i_ino)) 3967 if (!ext4_valid_inum(sb, inode->i_ino))
3849 return -EIO; 3968 return -EIO;
3850 3969
@@ -3951,7 +4070,7 @@ make_io:
3951 num = EXT4_INODES_PER_GROUP(sb); 4070 num = EXT4_INODES_PER_GROUP(sb);
3952 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4071 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3953 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4072 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3954 num -= le16_to_cpu(gdp->bg_itable_unused); 4073 num -= ext4_itable_unused_count(sb, gdp);
3955 table += num / inodes_per_block; 4074 table += num / inodes_per_block;
3956 if (end > table) 4075 if (end > table)
3957 end = table; 4076 end = table;
@@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
4313 EXT4_SET_RO_COMPAT_FEATURE(sb, 4432 EXT4_SET_RO_COMPAT_FEATURE(sb,
4314 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4433 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4315 sb->s_dirt = 1; 4434 sb->s_dirt = 1;
4316 handle->h_sync = 1; 4435 ext4_handle_sync(handle);
4317 err = ext4_journal_dirty_metadata(handle, 4436 err = ext4_handle_dirty_metadata(handle, inode,
4318 EXT4_SB(sb)->s_sbh); 4437 EXT4_SB(sb)->s_sbh);
4319 } 4438 }
4320 } 4439 }
@@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
4341 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4460 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4342 } 4461 }
4343 4462
4344 4463 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4345 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 4464 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4346 rc = ext4_journal_dirty_metadata(handle, bh);
4347 if (!err) 4465 if (!err)
4348 err = rc; 4466 err = rc;
4349 ei->i_state &= ~EXT4_STATE_NEW; 4467 ei->i_state &= ~EXT4_STATE_NEW;
@@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
4406 return ext4_force_commit(inode->i_sb); 4524 return ext4_force_commit(inode->i_sb);
4407} 4525}
4408 4526
4527int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4528{
4529 int err = 0;
4530
4531 mark_buffer_dirty(bh);
4532 if (inode && inode_needs_sync(inode)) {
4533 sync_dirty_buffer(bh);
4534 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4535 ext4_error(inode->i_sb, __func__,
4536 "IO error syncing inode, "
4537 "inode=%lu, block=%llu",
4538 inode->i_ino,
4539 (unsigned long long)bh->b_blocknr);
4540 err = -EIO;
4541 }
4542 }
4543 return err;
4544}
4545
4409/* 4546/*
4410 * ext4_setattr() 4547 * ext4_setattr()
4411 * 4548 *
@@ -4710,16 +4847,15 @@ int
4710ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4847ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4711 struct ext4_iloc *iloc) 4848 struct ext4_iloc *iloc)
4712{ 4849{
4713 int err = 0; 4850 int err;
4714 if (handle) { 4851
4715 err = ext4_get_inode_loc(inode, iloc); 4852 err = ext4_get_inode_loc(inode, iloc);
4716 if (!err) { 4853 if (!err) {
4717 BUFFER_TRACE(iloc->bh, "get_write_access"); 4854 BUFFER_TRACE(iloc->bh, "get_write_access");
4718 err = ext4_journal_get_write_access(handle, iloc->bh); 4855 err = ext4_journal_get_write_access(handle, iloc->bh);
4719 if (err) { 4856 if (err) {
4720 brelse(iloc->bh); 4857 brelse(iloc->bh);
4721 iloc->bh = NULL; 4858 iloc->bh = NULL;
4722 }
4723 } 4859 }
4724 } 4860 }
4725 ext4_std_error(inode->i_sb, err); 4861 ext4_std_error(inode->i_sb, err);
@@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4791 4927
4792 might_sleep(); 4928 might_sleep();
4793 err = ext4_reserve_inode_write(handle, inode, &iloc); 4929 err = ext4_reserve_inode_write(handle, inode, &iloc);
4794 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4930 if (ext4_handle_valid(handle) &&
4931 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4795 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4932 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4796 /* 4933 /*
4797 * We need extra buffer credits since we may write into EA block 4934 * We need extra buffer credits since we may write into EA block
@@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
4843 handle_t *current_handle = ext4_journal_current_handle(); 4980 handle_t *current_handle = ext4_journal_current_handle();
4844 handle_t *handle; 4981 handle_t *handle;
4845 4982
4983 if (!ext4_handle_valid(current_handle)) {
4984 ext4_mark_inode_dirty(current_handle, inode);
4985 return;
4986 }
4987
4846 handle = ext4_journal_start(inode, 2); 4988 handle = ext4_journal_start(inode, 2);
4847 if (IS_ERR(handle)) 4989 if (IS_ERR(handle))
4848 goto out; 4990 goto out;
@@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
4880 BUFFER_TRACE(iloc.bh, "get_write_access"); 5022 BUFFER_TRACE(iloc.bh, "get_write_access");
4881 err = jbd2_journal_get_write_access(handle, iloc.bh); 5023 err = jbd2_journal_get_write_access(handle, iloc.bh);
4882 if (!err) 5024 if (!err)
4883 err = ext4_journal_dirty_metadata(handle, 5025 err = ext4_handle_dirty_metadata(handle,
4884 iloc.bh); 5026 inode,
5027 iloc.bh);
4885 brelse(iloc.bh); 5028 brelse(iloc.bh);
4886 } 5029 }
4887 } 5030 }
@@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4907 */ 5050 */
4908 5051
4909 journal = EXT4_JOURNAL(inode); 5052 journal = EXT4_JOURNAL(inode);
5053 if (!journal)
5054 return 0;
4910 if (is_journal_aborted(journal)) 5055 if (is_journal_aborted(journal))
4911 return -EROFS; 5056 return -EROFS;
4912 5057
@@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4936 return PTR_ERR(handle); 5081 return PTR_ERR(handle);
4937 5082
4938 err = ext4_mark_inode_dirty(handle, inode); 5083 err = ext4_mark_inode_dirty(handle, inode);
4939 handle->h_sync = 1; 5084 ext4_handle_sync(handle);
4940 ext4_journal_stop(handle); 5085 ext4_journal_stop(handle);
4941 ext4_std_error(inode->i_sb, err); 5086 ext4_std_error(inode->i_sb, err);
4942 5087
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
99 goto flags_out; 99 goto flags_out;
100 } 100 }
101 if (IS_SYNC(inode)) 101 if (IS_SYNC(inode))
102 handle->h_sync = 1; 102 ext4_handle_sync(handle);
103 err = ext4_reserve_inode_write(handle, inode, &iloc); 103 err = ext4_reserve_inode_write(handle, inode, &iloc);
104 if (err) 104 if (err)
105 goto flags_err; 105 goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..918aec0c8a11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
100 * inode as: 100 * inode as:
101 * 101 *
102 * { page } 102 * { page }
103 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 103 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
104 * 104 *
105 * 105 *
106 * one block each for bitmap and buddy information. So for each group we 106 * one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,18 @@
330 * object 330 * object
331 * 331 *
332 */ 332 */
333static struct kmem_cache *ext4_pspace_cachep;
334static struct kmem_cache *ext4_ac_cachep;
335static struct kmem_cache *ext4_free_ext_cachep;
336static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343
344
333 345
334static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 346static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
335{ 347{
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
445 blocknr += first + i; 457 blocknr += first + i;
446 blocknr += 458 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 459 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 460 ext4_grp_locked_error(sb, e4b->bd_group,
449 ext4_error(sb, __func__, "double-free of inode" 461 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %lu)\n", 462 " %lu's block %llu(bit %u in group %u)",
451 inode ? inode->i_ino : 0, blocknr, 463 inode ? inode->i_ino : 0, blocknr,
452 first + i, e4b->bd_group); 464 first + i, e4b->bd_group);
453 } 465 }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 489 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 490 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 491 if (b1[i] != b2[i]) {
480 printk(KERN_ERR "corruption in group %lu " 492 printk(KERN_ERR "corruption in group %u "
481 "at byte %u(%u): %x in copy != %x " 493 "at byte %u(%u): %x in copy != %x "
482 "on disk/prealloc\n", 494 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]); 495 e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
690 grp->bb_fragments = fragments; 702 grp->bb_fragments = fragments;
691 703
692 if (free != grp->bb_free) { 704 if (free != grp->bb_free) {
693 ext4_error(sb, __func__, 705 ext4_grp_locked_error(sb, group, __func__,
694 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", 706 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
695 group, free, grp->bb_free); 707 group, free, grp->bb_free);
696 /* 708 /*
697 * If we intent to continue, we consider group descritor 709 * If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
716 * stored in the inode as 728 * stored in the inode as
717 * 729 *
718 * { page } 730 * { page }
719 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 731 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
720 * 732 *
721 * 733 *
722 * one block each for bitmap and buddy information. 734 * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 if (bh[i] == NULL) 794 if (bh[i] == NULL)
783 goto out; 795 goto out;
784 796
785 if (buffer_uptodate(bh[i]) && 797 if (bitmap_uptodate(bh[i]))
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
787 continue; 798 continue;
788 799
789 lock_buffer(bh[i]); 800 lock_buffer(bh[i]);
801 if (bitmap_uptodate(bh[i])) {
802 unlock_buffer(bh[i]);
803 continue;
804 }
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 806 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 807 ext4_init_block_bitmap(sb, bh[i],
793 first_group + i, desc); 808 first_group + i, desc);
809 set_bitmap_uptodate(bh[i]);
794 set_buffer_uptodate(bh[i]); 810 set_buffer_uptodate(bh[i]);
795 unlock_buffer(bh[i]);
796 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 811 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
812 unlock_buffer(bh[i]);
797 continue; 813 continue;
798 } 814 }
799 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 815 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
816 if (buffer_uptodate(bh[i])) {
817 /*
818 * if not uninit if bh is uptodate,
819 * bitmap is also uptodate
820 */
821 set_bitmap_uptodate(bh[i]);
822 unlock_buffer(bh[i]);
823 continue;
824 }
800 get_bh(bh[i]); 825 get_bh(bh[i]);
826 /*
827 * submit the buffer_head for read. We can
828 * safely mark the bitmap as uptodate now.
829 * We do it here so the bitmap uptodate bit
830 * get set with buffer lock held.
831 */
832 set_bitmap_uptodate(bh[i]);
801 bh[i]->b_end_io = end_buffer_read_sync; 833 bh[i]->b_end_io = end_buffer_read_sync;
802 submit_bh(READ, bh[i]); 834 submit_bh(READ, bh[i]);
803 mb_debug("read bitmap for group %lu\n", first_group + i); 835 mb_debug("read bitmap for group %u\n", first_group + i);
804 } 836 }
805 837
806 /* wait for I/O completion */ 838 /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
814 846
815 err = 0; 847 err = 0;
816 first_block = page->index * blocks_per_page; 848 first_block = page->index * blocks_per_page;
849 /* init the page */
850 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
817 for (i = 0; i < blocks_per_page; i++) { 851 for (i = 0; i < blocks_per_page; i++) {
818 int group; 852 int group;
819 struct ext4_group_info *grinfo; 853 struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
840 BUG_ON(incore == NULL); 874 BUG_ON(incore == NULL);
841 mb_debug("put buddy for group %u in page %lu/%x\n", 875 mb_debug("put buddy for group %u in page %lu/%x\n",
842 group, page->index, i * blocksize); 876 group, page->index, i * blocksize);
843 memset(data, 0xff, blocksize);
844 grinfo = ext4_get_group_info(sb, group); 877 grinfo = ext4_get_group_info(sb, group);
845 grinfo->bb_fragments = 0; 878 grinfo->bb_fragments = 0;
846 memset(grinfo->bb_counters, 0, 879 memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* 881 /*
849 * incore got set to the group block bitmap below 882 * incore got set to the group block bitmap below
850 */ 883 */
884 ext4_lock_group(sb, group);
851 ext4_mb_generate_buddy(sb, data, incore, group); 885 ext4_mb_generate_buddy(sb, data, incore, group);
886 ext4_unlock_group(sb, group);
852 incore = NULL; 887 incore = NULL;
853 } else { 888 } else {
854 /* this is block of bitmap */ 889 /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 897
863 /* mark all preallocated blks used in in-core bitmap */ 898 /* mark all preallocated blks used in in-core bitmap */
864 ext4_mb_generate_from_pa(sb, data, group); 899 ext4_mb_generate_from_pa(sb, data, group);
900 ext4_mb_generate_from_freelist(sb, data, group);
865 ext4_unlock_group(sb, group); 901 ext4_unlock_group(sb, group);
866 902
867 /* set incore so that the buddy information can be 903 /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
886ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 922ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
887 struct ext4_buddy *e4b) 923 struct ext4_buddy *e4b)
888{ 924{
889 struct ext4_sb_info *sbi = EXT4_SB(sb);
890 struct inode *inode = sbi->s_buddy_cache;
891 int blocks_per_page; 925 int blocks_per_page;
892 int block; 926 int block;
893 int pnum; 927 int pnum;
894 int poff; 928 int poff;
895 struct page *page; 929 struct page *page;
896 int ret; 930 int ret;
931 struct ext4_group_info *grp;
932 struct ext4_sb_info *sbi = EXT4_SB(sb);
933 struct inode *inode = sbi->s_buddy_cache;
897 934
898 mb_debug("load group %lu\n", group); 935 mb_debug("load group %u\n", group);
899 936
900 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 937 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
938 grp = ext4_get_group_info(sb, group);
901 939
902 e4b->bd_blkbits = sb->s_blocksize_bits; 940 e4b->bd_blkbits = sb->s_blocksize_bits;
903 e4b->bd_info = ext4_get_group_info(sb, group); 941 e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
905 e4b->bd_group = group; 943 e4b->bd_group = group;
906 e4b->bd_buddy_page = NULL; 944 e4b->bd_buddy_page = NULL;
907 e4b->bd_bitmap_page = NULL; 945 e4b->bd_bitmap_page = NULL;
946 e4b->alloc_semp = &grp->alloc_sem;
947
948 /* Take the read lock on the group alloc
949 * sem. This would make sure a parallel
950 * ext4_mb_init_group happening on other
951 * groups mapped by the page is blocked
952 * till we are done with allocation
953 */
954 down_read(e4b->alloc_semp);
908 955
909 /* 956 /*
910 * the buddy cache inode stores the block bitmap 957 * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
920 page = find_get_page(inode->i_mapping, pnum); 967 page = find_get_page(inode->i_mapping, pnum);
921 if (page == NULL || !PageUptodate(page)) { 968 if (page == NULL || !PageUptodate(page)) {
922 if (page) 969 if (page)
970 /*
971 * drop the page reference and try
972 * to get the page with lock. If we
973 * are not uptodate that implies
974 * somebody just created the page but
975 * is yet to initialize the same. So
976 * wait for it to initialize.
977 */
923 page_cache_release(page); 978 page_cache_release(page);
924 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 979 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
925 if (page) { 980 if (page) {
@@ -985,6 +1040,9 @@ err:
985 page_cache_release(e4b->bd_buddy_page); 1040 page_cache_release(e4b->bd_buddy_page);
986 e4b->bd_buddy = NULL; 1041 e4b->bd_buddy = NULL;
987 e4b->bd_bitmap = NULL; 1042 e4b->bd_bitmap = NULL;
1043
1044 /* Done with the buddy cache */
1045 up_read(e4b->alloc_semp);
988 return ret; 1046 return ret;
989} 1047}
990 1048
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
994 page_cache_release(e4b->bd_bitmap_page); 1052 page_cache_release(e4b->bd_bitmap_page);
995 if (e4b->bd_buddy_page) 1053 if (e4b->bd_buddy_page)
996 page_cache_release(e4b->bd_buddy_page); 1054 page_cache_release(e4b->bd_buddy_page);
1055 /* Done with the buddy cache */
1056 if (e4b->alloc_semp)
1057 up_read(e4b->alloc_semp);
997} 1058}
998 1059
999 1060
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 cur += 32; 1092 cur += 32;
1032 continue; 1093 continue;
1033 } 1094 }
1034 mb_clear_bit_atomic(lock, cur, bm); 1095 if (lock)
1096 mb_clear_bit_atomic(lock, cur, bm);
1097 else
1098 mb_clear_bit(cur, bm);
1035 cur++; 1099 cur++;
1036 } 1100 }
1037} 1101}
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1049 cur += 32; 1113 cur += 32;
1050 continue; 1114 continue;
1051 } 1115 }
1052 mb_set_bit_atomic(lock, cur, bm); 1116 if (lock)
1117 mb_set_bit_atomic(lock, cur, bm);
1118 else
1119 mb_set_bit(cur, bm);
1053 cur++; 1120 cur++;
1054 } 1121 }
1055} 1122}
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1094 blocknr += block; 1161 blocknr += block;
1095 blocknr += 1162 blocknr +=
1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1163 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1097 ext4_unlock_group(sb, e4b->bd_group); 1164 ext4_grp_locked_error(sb, e4b->bd_group,
1098 ext4_error(sb, __func__, "double-free of inode" 1165 __func__, "double-free of inode"
1099 " %lu's block %llu(bit %u in group %lu)\n", 1166 " %lu's block %llu(bit %u in group %u)",
1100 inode ? inode->i_ino : 0, blocknr, block, 1167 inode ? inode->i_ino : 0, blocknr, block,
1101 e4b->bd_group); 1168 e4b->bd_group);
1102 ext4_lock_group(sb, e4b->bd_group);
1103 } 1169 }
1104 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1170 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1105 e4b->bd_info->bb_counters[order]++; 1171 e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1296 ac->ac_tail = ret & 0xffff; 1362 ac->ac_tail = ret & 0xffff;
1297 ac->ac_buddy = ret >> 16; 1363 ac->ac_buddy = ret >> 16;
1298 1364
1299 /* XXXXXXX: SUCH A HORRIBLE **CK */ 1365 /*
1300 /*FIXME!! Why ? */ 1366 * take the page reference. We want the page to be pinned
1367 * so that we don't get a ext4_mb_init_cache_call for this
1368 * group until we update the bitmap. That would mean we
1369 * double allocate blocks. The reference is dropped
1370 * in ext4_mb_release_context
1371 */
1301 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1372 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1302 get_page(ac->ac_bitmap_page); 1373 get_page(ac->ac_bitmap_page);
1303 ac->ac_buddy_page = e4b->bd_buddy_page; 1374 ac->ac_buddy_page = e4b->bd_buddy_page;
1304 get_page(ac->ac_buddy_page); 1375 get_page(ac->ac_buddy_page);
1305 1376 /* on allocation we use ac to track the held semaphore */
1377 ac->alloc_semp = e4b->alloc_semp;
1378 e4b->alloc_semp = NULL;
1306 /* store last allocated for subsequent stream allocation */ 1379 /* store last allocated for subsequent stream allocation */
1307 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1380 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1308 spin_lock(&sbi->s_md_lock); 1381 spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1326 struct ext4_free_extent ex; 1399 struct ext4_free_extent ex;
1327 int max; 1400 int max;
1328 1401
1402 if (ac->ac_status == AC_STATUS_FOUND)
1403 return;
1329 /* 1404 /*
1330 * We don't want to scan for a whole year 1405 * We don't want to scan for a whole year
1331 */ 1406 */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1575 * free blocks even though group info says we 1650 * free blocks even though group info says we
1576 * we have free blocks 1651 * we have free blocks
1577 */ 1652 */
1578 ext4_error(sb, __func__, "%d free blocks as per " 1653 ext4_grp_locked_error(sb, e4b->bd_group,
1579 "group info. But bitmap says 0\n", 1654 __func__, "%d free blocks as per "
1655 "group info. But bitmap says 0",
1580 free); 1656 free);
1581 break; 1657 break;
1582 } 1658 }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1584 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1660 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1585 BUG_ON(ex.fe_len <= 0); 1661 BUG_ON(ex.fe_len <= 0);
1586 if (free < ex.fe_len) { 1662 if (free < ex.fe_len) {
1587 ext4_error(sb, __func__, "%d free blocks as per " 1663 ext4_grp_locked_error(sb, e4b->bd_group,
1588 "group info. But got %d blocks\n", 1664 __func__, "%d free blocks as per "
1665 "group info. But got %d blocks",
1589 free, ex.fe_len); 1666 free, ex.fe_len);
1590 /* 1667 /*
1591 * The number of free blocks differs. This mostly 1668 * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1692 return 0; 1769 return 0;
1693} 1770}
1694 1771
1772/*
1773 * lock the group_info alloc_sem of all the groups
1774 * belonging to the same buddy cache page. This
1775 * make sure other parallel operation on the buddy
1776 * cache doesn't happen whild holding the buddy cache
1777 * lock
1778 */
1779int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1780{
1781 int i;
1782 int block, pnum;
1783 int blocks_per_page;
1784 int groups_per_page;
1785 ext4_group_t first_group;
1786 struct ext4_group_info *grp;
1787
1788 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1789 /*
1790 * the buddy cache inode stores the block bitmap
1791 * and buddy information in consecutive blocks.
1792 * So for each group we need two blocks.
1793 */
1794 block = group * 2;
1795 pnum = block / blocks_per_page;
1796 first_group = pnum * blocks_per_page / 2;
1797
1798 groups_per_page = blocks_per_page >> 1;
1799 if (groups_per_page == 0)
1800 groups_per_page = 1;
1801 /* read all groups the page covers into the cache */
1802 for (i = 0; i < groups_per_page; i++) {
1803
1804 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1805 break;
1806 grp = ext4_get_group_info(sb, first_group + i);
1807 /* take all groups write allocation
1808 * semaphore. This make sure there is
1809 * no block allocation going on in any
1810 * of that groups
1811 */
1812 down_write_nested(&grp->alloc_sem, i);
1813 }
1814 return i;
1815}
1816
1817void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1818 ext4_group_t group, int locked_group)
1819{
1820 int i;
1821 int block, pnum;
1822 int blocks_per_page;
1823 ext4_group_t first_group;
1824 struct ext4_group_info *grp;
1825
1826 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1827 /*
1828 * the buddy cache inode stores the block bitmap
1829 * and buddy information in consecutive blocks.
1830 * So for each group we need two blocks.
1831 */
1832 block = group * 2;
1833 pnum = block / blocks_per_page;
1834 first_group = pnum * blocks_per_page / 2;
1835 /* release locks on all the groups */
1836 for (i = 0; i < locked_group; i++) {
1837
1838 grp = ext4_get_group_info(sb, first_group + i);
1839 /* take all groups write allocation
1840 * semaphore. This make sure there is
1841 * no block allocation going on in any
1842 * of that groups
1843 */
1844 up_write(&grp->alloc_sem);
1845 }
1846
1847}
1848
1849static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1850{
1851
1852 int ret;
1853 void *bitmap;
1854 int blocks_per_page;
1855 int block, pnum, poff;
1856 int num_grp_locked = 0;
1857 struct ext4_group_info *this_grp;
1858 struct ext4_sb_info *sbi = EXT4_SB(sb);
1859 struct inode *inode = sbi->s_buddy_cache;
1860 struct page *page = NULL, *bitmap_page = NULL;
1861
1862 mb_debug("init group %lu\n", group);
1863 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1864 this_grp = ext4_get_group_info(sb, group);
1865 /*
1866 * This ensures we don't add group
1867 * to this buddy cache via resize
1868 */
1869 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1870 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1871 /*
1872 * somebody initialized the group
1873 * return without doing anything
1874 */
1875 ret = 0;
1876 goto err;
1877 }
1878 /*
1879 * the buddy cache inode stores the block bitmap
1880 * and buddy information in consecutive blocks.
1881 * So for each group we need two blocks.
1882 */
1883 block = group * 2;
1884 pnum = block / blocks_per_page;
1885 poff = block % blocks_per_page;
1886 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1887 if (page) {
1888 BUG_ON(page->mapping != inode->i_mapping);
1889 ret = ext4_mb_init_cache(page, NULL);
1890 if (ret) {
1891 unlock_page(page);
1892 goto err;
1893 }
1894 unlock_page(page);
1895 }
1896 if (page == NULL || !PageUptodate(page)) {
1897 ret = -EIO;
1898 goto err;
1899 }
1900 mark_page_accessed(page);
1901 bitmap_page = page;
1902 bitmap = page_address(page) + (poff * sb->s_blocksize);
1903
1904 /* init buddy cache */
1905 block++;
1906 pnum = block / blocks_per_page;
1907 poff = block % blocks_per_page;
1908 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1909 if (page == bitmap_page) {
1910 /*
1911 * If both the bitmap and buddy are in
1912 * the same page we don't need to force
1913 * init the buddy
1914 */
1915 unlock_page(page);
1916 } else if (page) {
1917 BUG_ON(page->mapping != inode->i_mapping);
1918 ret = ext4_mb_init_cache(page, bitmap);
1919 if (ret) {
1920 unlock_page(page);
1921 goto err;
1922 }
1923 unlock_page(page);
1924 }
1925 if (page == NULL || !PageUptodate(page)) {
1926 ret = -EIO;
1927 goto err;
1928 }
1929 mark_page_accessed(page);
1930err:
1931 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1932 if (bitmap_page)
1933 page_cache_release(bitmap_page);
1934 if (page)
1935 page_cache_release(page);
1936 return ret;
1937}
1938
1695static noinline_for_stack int 1939static noinline_for_stack int
1696ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1940ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1697{ 1941{
@@ -1775,7 +2019,7 @@ repeat:
1775 group = 0; 2019 group = 0;
1776 2020
1777 /* quick check to skip empty groups */ 2021 /* quick check to skip empty groups */
1778 grp = ext4_get_group_info(ac->ac_sb, group); 2022 grp = ext4_get_group_info(sb, group);
1779 if (grp->bb_free == 0) 2023 if (grp->bb_free == 0)
1780 continue; 2024 continue;
1781 2025
@@ -1788,10 +2032,9 @@ repeat:
1788 * we need full data about the group 2032 * we need full data about the group
1789 * to make a good selection 2033 * to make a good selection
1790 */ 2034 */
1791 err = ext4_mb_load_buddy(sb, group, &e4b); 2035 err = ext4_mb_init_group(sb, group);
1792 if (err) 2036 if (err)
1793 goto out; 2037 goto out;
1794 ext4_mb_release_desc(&e4b);
1795 } 2038 }
1796 2039
1797 /* 2040 /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1932 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2175 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
1933 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2176 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
1934 "%-5u %-5s %-5u %-6u\n"; 2177 "%-5u %-5s %-5u %-6u\n";
1935 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2178 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1936 hs->result.fe_start, hs->result.fe_len, 2179 hs->result.fe_start, hs->result.fe_len,
1937 hs->result.fe_logical); 2180 hs->result.fe_logical);
1938 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2181 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1939 hs->orig.fe_start, hs->orig.fe_len, 2182 hs->orig.fe_start, hs->orig.fe_len,
1940 hs->orig.fe_logical); 2183 hs->orig.fe_logical);
1941 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, 2184 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
1942 hs->goal.fe_start, hs->goal.fe_len, 2185 hs->goal.fe_start, hs->goal.fe_len,
1943 hs->goal.fe_logical); 2186 hs->goal.fe_logical);
1944 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1947 hs->buddy ? 1 << hs->buddy : 0); 2190 hs->buddy ? 1 << hs->buddy : 0);
1948 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2191 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
1949 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2192 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
1950 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2193 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1951 hs->result.fe_start, hs->result.fe_len, 2194 hs->result.fe_start, hs->result.fe_len,
1952 hs->result.fe_logical); 2195 hs->result.fe_logical);
1953 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2196 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1954 hs->orig.fe_start, hs->orig.fe_len, 2197 hs->orig.fe_start, hs->orig.fe_len,
1955 hs->orig.fe_logical); 2198 hs->orig.fe_logical);
1956 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2199 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
1957 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2200 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
1958 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2201 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1959 hs->result.fe_start, hs->result.fe_len); 2202 hs->result.fe_start, hs->result.fe_len);
1960 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2203 seq_printf(seq, "%-5u %-8u %-23s discard\n",
1961 hs->pid, hs->ino, buf2); 2204 hs->pid, hs->ino, buf2);
1962 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2205 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
1963 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2206 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1964 hs->result.fe_start, hs->result.fe_len); 2207 hs->result.fe_start, hs->result.fe_len);
1965 seq_printf(seq, "%-5u %-8u %-23s free\n", 2208 seq_printf(seq, "%-5u %-8u %-23s free\n",
1966 hs->pid, hs->ino, buf2); 2209 hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2073 return NULL; 2316 return NULL;
2074 2317
2075 group = *pos + 1; 2318 group = *pos + 1;
2076 return (void *) group; 2319 return (void *) ((unsigned long) group);
2077} 2320}
2078 2321
2079static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2322static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2086 if (*pos < 0 || *pos >= sbi->s_groups_count) 2329 if (*pos < 0 || *pos >= sbi->s_groups_count)
2087 return NULL; 2330 return NULL;
2088 group = *pos + 1; 2331 group = *pos + 1;
2089 return (void *) group;; 2332 return (void *) ((unsigned long) group);
2090} 2333}
2091 2334
2092static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2335static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2093{ 2336{
2094 struct super_block *sb = seq->private; 2337 struct super_block *sb = seq->private;
2095 long group = (long) v; 2338 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2096 int i; 2339 int i;
2097 int err; 2340 int err;
2098 struct ext4_buddy e4b; 2341 struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2114 sizeof(struct ext4_group_info); 2357 sizeof(struct ext4_group_info);
2115 err = ext4_mb_load_buddy(sb, group, &e4b); 2358 err = ext4_mb_load_buddy(sb, group, &e4b);
2116 if (err) { 2359 if (err) {
2117 seq_printf(seq, "#%-5lu: I/O error\n", group); 2360 seq_printf(seq, "#%-5u: I/O error\n", group);
2118 return 0; 2361 return 0;
2119 } 2362 }
2120 ext4_lock_group(sb, group); 2363 ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2122 ext4_unlock_group(sb, group); 2365 ext4_unlock_group(sb, group);
2123 ext4_mb_release_desc(&e4b); 2366 ext4_mb_release_desc(&e4b);
2124 2367
2125 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, 2368 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2126 sg.info.bb_fragments, sg.info.bb_first_free); 2369 sg.info.bb_fragments, sg.info.bb_first_free);
2127 for (i = 0; i <= 13; i++) 2370 for (i = 0; i <= 13; i++)
2128 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 2371 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2296 ext4_free_blocks_after_init(sb, group, desc); 2539 ext4_free_blocks_after_init(sb, group, desc);
2297 } else { 2540 } else {
2298 meta_group_info[i]->bb_free = 2541 meta_group_info[i]->bb_free =
2299 le16_to_cpu(desc->bg_free_blocks_count); 2542 ext4_free_blks_count(sb, desc);
2300 } 2543 }
2301 2544
2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2545 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2546 init_rwsem(&meta_group_info[i]->alloc_sem);
2303 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2547 meta_group_info[i]->bb_free_root.rb_node = NULL;;
2304 2548
2305#ifdef DOUBLE_CHECK 2549#ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
2327} /* ext4_mb_add_groupinfo */ 2571} /* ext4_mb_add_groupinfo */
2328 2572
2329/* 2573/*
2330 * Add a group to the existing groups.
2331 * This function is used for online resize
2332 */
2333int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2334 struct ext4_group_desc *desc)
2335{
2336 struct ext4_sb_info *sbi = EXT4_SB(sb);
2337 struct inode *inode = sbi->s_buddy_cache;
2338 int blocks_per_page;
2339 int block;
2340 int pnum;
2341 struct page *page;
2342 int err;
2343
2344 /* Add group based on group descriptor*/
2345 err = ext4_mb_add_groupinfo(sb, group, desc);
2346 if (err)
2347 return err;
2348
2349 /*
2350 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
2351 * datas) are set not up to date so that they will be re-initilaized
2352 * during the next call to ext4_mb_load_buddy
2353 */
2354
2355 /* Set buddy page as not up to date */
2356 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2357 block = group * 2;
2358 pnum = block / blocks_per_page;
2359 page = find_get_page(inode->i_mapping, pnum);
2360 if (page != NULL) {
2361 ClearPageUptodate(page);
2362 page_cache_release(page);
2363 }
2364
2365 /* Set bitmap page as not up to date */
2366 block++;
2367 pnum = block / blocks_per_page;
2368 page = find_get_page(inode->i_mapping, pnum);
2369 if (page != NULL) {
2370 ClearPageUptodate(page);
2371 page_cache_release(page);
2372 }
2373
2374 return 0;
2375}
2376
2377/*
2378 * Update an existing group. 2574 * Update an existing group.
2379 * This function is used for online resize 2575 * This function is used for online resize
2380 */ 2576 */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2457 desc = ext4_get_group_desc(sb, i, NULL); 2653 desc = ext4_get_group_desc(sb, i, NULL);
2458 if (desc == NULL) { 2654 if (desc == NULL) {
2459 printk(KERN_ERR 2655 printk(KERN_ERR
2460 "EXT4-fs: can't read descriptor %lu\n", i); 2656 "EXT4-fs: can't read descriptor %u\n", i);
2461 goto err_freebuddy; 2657 goto err_freebuddy;
2462 } 2658 }
2463 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2659 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2493 if (sbi->s_mb_offsets == NULL) { 2689 if (sbi->s_mb_offsets == NULL) {
2494 return -ENOMEM; 2690 return -ENOMEM;
2495 } 2691 }
2692
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2496 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2497 if (sbi->s_mb_maxs == NULL) { 2695 if (sbi->s_mb_maxs == NULL) {
2498 kfree(sbi->s_mb_maxs); 2696 kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2551 ext4_mb_init_per_dev_proc(sb); 2749 ext4_mb_init_per_dev_proc(sb);
2552 ext4_mb_history_init(sb); 2750 ext4_mb_history_init(sb);
2553 2751
2554 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2752 if (sbi->s_journal)
2753 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2555 2754
2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2755 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2557 return 0; 2756 return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2652 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2851 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2653 entry = list_entry(l, struct ext4_free_data, list); 2852 entry = list_entry(l, struct ext4_free_data, list);
2654 2853
2655 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2854 mb_debug("gonna free %u blocks in group %u (0x%p):",
2656 entry->count, entry->group, entry); 2855 entry->count, entry->group, entry);
2657 2856
2658 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2857 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2679 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2878 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2680 + entry->start_blk 2879 + entry->start_blk
2681 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2880 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2682 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, 2881 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
2683 (unsigned long long) discard_block, entry->count); 2882 sb->s_id, (unsigned long long) discard_block,
2883 entry->count);
2684 sb_issue_discard(sb, discard_block, entry->count); 2884 sb_issue_discard(sb, discard_block, entry->count);
2685 2885
2686 kmem_cache_free(ext4_free_ext_cachep, entry); 2886 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
2791 */ 2991 */
2792static noinline_for_stack int 2992static noinline_for_stack int
2793ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2993ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2794 handle_t *handle, unsigned long reserv_blks) 2994 handle_t *handle, unsigned int reserv_blks)
2795{ 2995{
2796 struct buffer_head *bitmap_bh = NULL; 2996 struct buffer_head *bitmap_bh = NULL;
2797 struct ext4_super_block *es; 2997 struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2824 if (!gdp) 3024 if (!gdp)
2825 goto out_err; 3025 goto out_err;
2826 3026
2827 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2828 gdp->bg_free_blocks_count); 3028 gdp->bg_free_blocks_count);
2829 3029
2830 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2843 in_range(block + len - 1, ext4_inode_table(sb, gdp), 3043 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2844 EXT4_SB(sb)->s_itb_per_group)) { 3044 EXT4_SB(sb)->s_itb_per_group)) {
2845 ext4_error(sb, __func__, 3045 ext4_error(sb, __func__,
2846 "Allocating block in system zone - block = %llu", 3046 "Allocating block %llu in system zone of %d group\n",
2847 block); 3047 block, ac->ac_b_ex.fe_group);
2848 /* File system mounted not to panic on error 3048 /* File system mounted not to panic on error
2849 * Fix the bitmap and repeat the block allocation 3049 * Fix the bitmap and repeat the block allocation
2850 * We leak some of the blocks here. 3050 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2852 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 3052 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
2853 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3053 bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2854 ac->ac_b_ex.fe_len); 3054 ac->ac_b_ex.fe_len);
2855 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3055 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2856 if (!err) 3056 if (!err)
2857 err = -EAGAIN; 3057 err = -EAGAIN;
2858 goto out_err; 3058 goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2866 } 3066 }
2867 } 3067 }
2868#endif 3068#endif
2869 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2870 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2871
2872 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3069 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3070 mb_set_bits(NULL, bitmap_bh->b_data,
3071 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2873 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3072 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2874 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3073 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2875 gdp->bg_free_blocks_count = 3074 ext4_free_blks_set(sb, gdp,
2876 cpu_to_le16(ext4_free_blocks_after_init(sb, 3075 ext4_free_blocks_after_init(sb,
2877 ac->ac_b_ex.fe_group, 3076 ac->ac_b_ex.fe_group, gdp));
2878 gdp));
2879 } 3077 }
2880 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 3078 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3079 ext4_free_blks_set(sb, gdp, len);
2881 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3080 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2882 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3081 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2883 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3082 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2899 spin_unlock(sb_bgl_lock(sbi, flex_group)); 3098 spin_unlock(sb_bgl_lock(sbi, flex_group));
2900 } 3099 }
2901 3100
2902 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3101 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2903 if (err) 3102 if (err)
2904 goto out_err; 3103 goto out_err;
2905 err = ext4_journal_dirty_metadata(handle, gdp_bh); 3104 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2906 3105
2907out_err: 3106out_err:
2908 sb->s_dirt = 1; 3107 sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3031 /* check we don't cross already preallocated blocks */ 3230 /* check we don't cross already preallocated blocks */
3032 rcu_read_lock(); 3231 rcu_read_lock();
3033 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3232 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3034 unsigned long pa_end; 3233 ext4_lblk_t pa_end;
3035 3234
3036 if (pa->pa_deleted) 3235 if (pa->pa_deleted)
3037 continue; 3236 continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 /* XXX: extra loop to check we really don't overlap preallocations */ 3274 /* XXX: extra loop to check we really don't overlap preallocations */
3076 rcu_read_lock(); 3275 rcu_read_lock();
3077 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3276 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3078 unsigned long pa_end; 3277 ext4_lblk_t pa_end;
3079 spin_lock(&pa->pa_lock); 3278 spin_lock(&pa->pa_lock);
3080 if (pa->pa_deleted == 0) { 3279 if (pa->pa_deleted == 0) {
3081 pa_end = pa->pa_lstart + pa->pa_len; 3280 pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3307} 3506}
3308 3507
3309/* 3508/*
3509 * the function goes through all block freed in the group
3510 * but not yet committed and marks them used in in-core bitmap.
3511 * buddy must be generated from this bitmap
3512 * Need to be called with ext4 group lock (ext4_lock_group)
3513 */
3514static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3515 ext4_group_t group)
3516{
3517 struct rb_node *n;
3518 struct ext4_group_info *grp;
3519 struct ext4_free_data *entry;
3520
3521 grp = ext4_get_group_info(sb, group);
3522 n = rb_first(&(grp->bb_free_root));
3523
3524 while (n) {
3525 entry = rb_entry(n, struct ext4_free_data, node);
3526 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3527 bitmap, entry->start_blk,
3528 entry->count);
3529 n = rb_next(n);
3530 }
3531 return;
3532}
3533
3534/*
3310 * the function goes through all preallocation in this group and marks them 3535 * the function goes through all preallocation in this group and marks them
3311 * used in in-core bitmap. buddy must be generated from this bitmap 3536 * used in in-core bitmap. buddy must be generated from this bitmap
3312 * Need to be called with ext4 group lock (ext4_lock_group) 3537 * Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3346 preallocated += len; 3571 preallocated += len;
3347 count++; 3572 count++;
3348 } 3573 }
3349 mb_debug("prellocated %u for group %lu\n", preallocated, group); 3574 mb_debug("prellocated %u for group %u\n", preallocated, group);
3350} 3575}
3351 3576
3352static void ext4_mb_pa_callback(struct rcu_head *head) 3577static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
3363static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3588static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3364 struct super_block *sb, struct ext4_prealloc_space *pa) 3589 struct super_block *sb, struct ext4_prealloc_space *pa)
3365{ 3590{
3366 unsigned long grp; 3591 ext4_group_t grp;
3367 3592
3368 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3369 return; 3594 return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3473 3698
3474 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3699 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3475 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3700 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3701 trace_mark(ext4_mb_new_inode_pa,
3702 "dev %s ino %lu pstart %llu len %u lstart %u",
3703 sb->s_id, ac->ac_inode->i_ino,
3704 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3476 3705
3477 ext4_mb_use_inode_pa(ac, pa); 3706 ext4_mb_use_inode_pa(ac, pa);
3478 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3707 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3530 pa->pa_linear = 1; 3759 pa->pa_linear = 1;
3531 3760
3532 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3761 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3533 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3762 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3763 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
3764 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3534 3765
3535 ext4_mb_use_group_pa(ac, pa); 3766 ext4_mb_use_group_pa(ac, pa);
3536 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3767 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3579{ 3810{
3580 struct super_block *sb = e4b->bd_sb; 3811 struct super_block *sb = e4b->bd_sb;
3581 struct ext4_sb_info *sbi = EXT4_SB(sb); 3812 struct ext4_sb_info *sbi = EXT4_SB(sb);
3582 unsigned long end; 3813 unsigned int end;
3583 unsigned long next; 3814 unsigned int next;
3584 ext4_group_t group; 3815 ext4_group_t group;
3585 ext4_grpblk_t bit; 3816 ext4_grpblk_t bit;
3817 unsigned long long grp_blk_start;
3586 sector_t start; 3818 sector_t start;
3587 int err = 0; 3819 int err = 0;
3588 int free = 0; 3820 int free = 0;
3589 3821
3590 BUG_ON(pa->pa_deleted == 0); 3822 BUG_ON(pa->pa_deleted == 0);
3591 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3823 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3824 grp_blk_start = pa->pa_pstart - bit;
3592 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3825 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3593 end = bit + pa->pa_len; 3826 end = bit + pa->pa_len;
3594 3827
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3618 ext4_mb_store_history(ac); 3851 ext4_mb_store_history(ac);
3619 } 3852 }
3620 3853
3854 trace_mark(ext4_mb_release_inode_pa,
3855 "dev %s ino %lu block %llu count %u",
3856 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3857 next - bit);
3621 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3858 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3622 bit = next + 1; 3859 bit = next + 1;
3623 } 3860 }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3626 pa, (unsigned long) pa->pa_lstart, 3863 pa, (unsigned long) pa->pa_lstart,
3627 (unsigned long) pa->pa_pstart, 3864 (unsigned long) pa->pa_pstart,
3628 (unsigned long) pa->pa_len); 3865 (unsigned long) pa->pa_len);
3629 ext4_error(sb, __func__, "free %u, pa_free %u\n", 3866 ext4_grp_locked_error(sb, group,
3630 free, pa->pa_free); 3867 __func__, "free %u, pa_free %u",
3868 free, pa->pa_free);
3631 /* 3869 /*
3632 * pa is already deleted so we use the value obtained 3870 * pa is already deleted so we use the value obtained
3633 * from the bitmap and continue. 3871 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3650 if (ac) 3888 if (ac)
3651 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3889 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3652 3890
3891 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
3892 sb->s_id, pa->pa_pstart, pa->pa_len);
3653 BUG_ON(pa->pa_deleted == 0); 3893 BUG_ON(pa->pa_deleted == 0);
3654 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3894 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3655 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3895 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3692 int busy = 0; 3932 int busy = 0;
3693 int free = 0; 3933 int free = 0;
3694 3934
3695 mb_debug("discard preallocation for group %lu\n", group); 3935 mb_debug("discard preallocation for group %u\n", group);
3696 3936
3697 if (list_empty(&grp->bb_prealloc_list)) 3937 if (list_empty(&grp->bb_prealloc_list))
3698 return 0; 3938 return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3700 bitmap_bh = ext4_read_block_bitmap(sb, group); 3940 bitmap_bh = ext4_read_block_bitmap(sb, group);
3701 if (bitmap_bh == NULL) { 3941 if (bitmap_bh == NULL) {
3702 ext4_error(sb, __func__, "Error in reading block " 3942 ext4_error(sb, __func__, "Error in reading block "
3703 "bitmap for %lu\n", group); 3943 "bitmap for %u", group);
3704 return 0; 3944 return 0;
3705 } 3945 }
3706 3946
3707 err = ext4_mb_load_buddy(sb, group, &e4b); 3947 err = ext4_mb_load_buddy(sb, group, &e4b);
3708 if (err) { 3948 if (err) {
3709 ext4_error(sb, __func__, "Error in loading buddy " 3949 ext4_error(sb, __func__, "Error in loading buddy "
3710 "information for %lu\n", group); 3950 "information for %u", group);
3711 put_bh(bitmap_bh); 3951 put_bh(bitmap_bh);
3712 return 0; 3952 return 0;
3713 } 3953 }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
3815 } 4055 }
3816 4056
3817 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4057 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4058 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
4059 inode->i_ino);
3818 4060
3819 INIT_LIST_HEAD(&list); 4061 INIT_LIST_HEAD(&list);
3820 4062
@@ -3874,14 +4116,14 @@ repeat:
3874 err = ext4_mb_load_buddy(sb, group, &e4b); 4116 err = ext4_mb_load_buddy(sb, group, &e4b);
3875 if (err) { 4117 if (err) {
3876 ext4_error(sb, __func__, "Error in loading buddy " 4118 ext4_error(sb, __func__, "Error in loading buddy "
3877 "information for %lu\n", group); 4119 "information for %u", group);
3878 continue; 4120 continue;
3879 } 4121 }
3880 4122
3881 bitmap_bh = ext4_read_block_bitmap(sb, group); 4123 bitmap_bh = ext4_read_block_bitmap(sb, group);
3882 if (bitmap_bh == NULL) { 4124 if (bitmap_bh == NULL) {
3883 ext4_error(sb, __func__, "Error in reading block " 4125 ext4_error(sb, __func__, "Error in reading block "
3884 "bitmap for %lu\n", group); 4126 "bitmap for %u", group);
3885 ext4_mb_release_desc(&e4b); 4127 ext4_mb_release_desc(&e4b);
3886 continue; 4128 continue;
3887 } 4129 }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4024 struct ext4_sb_info *sbi = EXT4_SB(sb); 4266 struct ext4_sb_info *sbi = EXT4_SB(sb);
4025 struct ext4_super_block *es = sbi->s_es; 4267 struct ext4_super_block *es = sbi->s_es;
4026 ext4_group_t group; 4268 ext4_group_t group;
4027 unsigned long len; 4269 unsigned int len;
4028 unsigned long goal; 4270 ext4_fsblk_t goal;
4029 ext4_grpblk_t block; 4271 ext4_grpblk_t block;
4030 4272
4031 /* we can't allocate > group size */ 4273 /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4068 ac->ac_pa = NULL; 4310 ac->ac_pa = NULL;
4069 ac->ac_bitmap_page = NULL; 4311 ac->ac_bitmap_page = NULL;
4070 ac->ac_buddy_page = NULL; 4312 ac->ac_buddy_page = NULL;
4313 ac->alloc_semp = NULL;
4071 ac->ac_lg = NULL; 4314 ac->ac_lg = NULL;
4072 4315
4073 /* we have to define context: we'll we work with a file or 4316 /* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4146 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4389 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4147 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4390 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4148 ext4_error(sb, __func__, "Error in loading buddy " 4391 ext4_error(sb, __func__, "Error in loading buddy "
4149 "information for %lu\n", group); 4392 "information for %u", group);
4150 continue; 4393 continue;
4151 } 4394 }
4152 ext4_lock_group(sb, group); 4395 ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4248 } 4491 }
4249 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4250 } 4493 }
4494 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp);
4251 if (ac->ac_bitmap_page) 4496 if (ac->ac_bitmap_page)
4252 page_cache_release(ac->ac_bitmap_page); 4497 page_cache_release(ac->ac_bitmap_page);
4253 if (ac->ac_buddy_page) 4498 if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4264 int ret; 4509 int ret;
4265 int freed = 0; 4510 int freed = 0;
4266 4511
4512 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
4513 sb->s_id, needed);
4267 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4514 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4268 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4515 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4269 freed += ret; 4516 freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4286 struct ext4_sb_info *sbi; 4533 struct ext4_sb_info *sbi;
4287 struct super_block *sb; 4534 struct super_block *sb;
4288 ext4_fsblk_t block = 0; 4535 ext4_fsblk_t block = 0;
4289 unsigned long inquota; 4536 unsigned int inquota;
4290 unsigned long reserv_blks = 0; 4537 unsigned int reserv_blks = 0;
4291 4538
4292 sb = ar->inode->i_sb; 4539 sb = ar->inode->i_sb;
4293 sbi = EXT4_SB(sb); 4540 sbi = EXT4_SB(sb);
4294 4541
4542 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
4543 "lblk %llu goal %llu lleft %llu lright %llu "
4544 "pleft %llu pright %llu ",
4545 sb->s_id, ar->flags, ar->len,
4546 ar->inode ? ar->inode->i_ino : 0,
4547 (unsigned long long) ar->logical,
4548 (unsigned long long) ar->goal,
4549 (unsigned long long) ar->lleft,
4550 (unsigned long long) ar->lright,
4551 (unsigned long long) ar->pleft,
4552 (unsigned long long) ar->pright);
4553
4295 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4554 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4296 /* 4555 /*
4297 * With delalloc we already reserved the blocks 4556 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 } 4572 }
4314 if (ar->len == 0) { 4573 if (ar->len == 0) {
4315 *errp = -EDQUOT; 4574 *errp = -EDQUOT;
4316 return 0; 4575 goto out3;
4317 } 4576 }
4318 inquota = ar->len; 4577 inquota = ar->len;
4319 4578
@@ -4348,10 +4607,14 @@ repeat:
4348 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4607 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4349 ext4_mb_new_preallocation(ac); 4608 ext4_mb_new_preallocation(ac);
4350 } 4609 }
4351
4352 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4610 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4353 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4611 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4354 if (*errp == -EAGAIN) { 4612 if (*errp == -EAGAIN) {
4613 /*
4614 * drop the reference that we took
4615 * in ext4_mb_use_best_found
4616 */
4617 ext4_mb_release_context(ac);
4355 ac->ac_b_ex.fe_group = 0; 4618 ac->ac_b_ex.fe_group = 0;
4356 ac->ac_b_ex.fe_start = 0; 4619 ac->ac_b_ex.fe_start = 0;
4357 ac->ac_b_ex.fe_len = 0; 4620 ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
4382out1: 4645out1:
4383 if (ar->len < inquota) 4646 if (ar->len < inquota)
4384 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4647 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4648out3:
4649 if (!ar->len) {
4650 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4651 /* release all the reserved blocks if non delalloc */
4652 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4653 reserv_blks);
4654 }
4655
4656 trace_mark(ext4_allocate_blocks,
4657 "dev %s block %llu flags %u len %u ino %lu "
4658 "logical %llu goal %llu lleft %llu lright %llu "
4659 "pleft %llu pright %llu ",
4660 sb->s_id, (unsigned long long) block,
4661 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4662 (unsigned long long) ar->logical,
4663 (unsigned long long) ar->goal,
4664 (unsigned long long) ar->lleft,
4665 (unsigned long long) ar->lright,
4666 (unsigned long long) ar->pleft,
4667 (unsigned long long) ar->pright);
4385 4668
4386 return block; 4669 return block;
4387} 4670}
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
4403 4686
4404static noinline_for_stack int 4687static noinline_for_stack int
4405ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4688ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4406 ext4_group_t group, ext4_grpblk_t block, int count) 4689 struct ext4_free_data *new_entry)
4407{ 4690{
4691 ext4_grpblk_t block;
4692 struct ext4_free_data *entry;
4408 struct ext4_group_info *db = e4b->bd_info; 4693 struct ext4_group_info *db = e4b->bd_info;
4409 struct super_block *sb = e4b->bd_sb; 4694 struct super_block *sb = e4b->bd_sb;
4410 struct ext4_sb_info *sbi = EXT4_SB(sb); 4695 struct ext4_sb_info *sbi = EXT4_SB(sb);
4411 struct ext4_free_data *entry, *new_entry;
4412 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4696 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4413 struct rb_node *parent = NULL, *new_node; 4697 struct rb_node *parent = NULL, *new_node;
4414 4698
4415 4699 BUG_ON(!ext4_handle_valid(handle));
4416 BUG_ON(e4b->bd_bitmap_page == NULL); 4700 BUG_ON(e4b->bd_bitmap_page == NULL);
4417 BUG_ON(e4b->bd_buddy_page == NULL); 4701 BUG_ON(e4b->bd_buddy_page == NULL);
4418 4702
4419 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4420 new_entry->start_blk = block;
4421 new_entry->group = group;
4422 new_entry->count = count;
4423 new_entry->t_tid = handle->h_transaction->t_tid;
4424 new_node = &new_entry->node; 4703 new_node = &new_entry->node;
4704 block = new_entry->start_blk;
4425 4705
4426 ext4_lock_group(sb, group);
4427 if (!*n) { 4706 if (!*n) {
4428 /* first free block exent. We need to 4707 /* first free block exent. We need to
4429 protect buddy cache from being freed, 4708 protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4441 else if (block >= (entry->start_blk + entry->count)) 4720 else if (block >= (entry->start_blk + entry->count))
4442 n = &(*n)->rb_right; 4721 n = &(*n)->rb_right;
4443 else { 4722 else {
4444 ext4_unlock_group(sb, group); 4723 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
4445 ext4_error(sb, __func__, 4724 "Double free of blocks %d (%d %d)",
4446 "Double free of blocks %d (%d %d)\n", 4725 block, entry->start_blk, entry->count);
4447 block, entry->start_blk, entry->count);
4448 return 0; 4726 return 0;
4449 } 4727 }
4450 } 4728 }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 spin_lock(&sbi->s_md_lock); 4761 spin_lock(&sbi->s_md_lock);
4484 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4762 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4485 spin_unlock(&sbi->s_md_lock); 4763 spin_unlock(&sbi->s_md_lock);
4486 ext4_unlock_group(sb, group);
4487 return 0; 4764 return 0;
4488} 4765}
4489 4766
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4499 struct ext4_allocation_context *ac = NULL; 4776 struct ext4_allocation_context *ac = NULL;
4500 struct ext4_group_desc *gdp; 4777 struct ext4_group_desc *gdp;
4501 struct ext4_super_block *es; 4778 struct ext4_super_block *es;
4502 unsigned long overflow; 4779 unsigned int overflow;
4503 ext4_grpblk_t bit; 4780 ext4_grpblk_t bit;
4504 struct buffer_head *gd_bh; 4781 struct buffer_head *gd_bh;
4505 ext4_group_t block_group; 4782 ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4522 } 4799 }
4523 4800
4524 ext4_debug("freeing block %lu\n", block); 4801 ext4_debug("freeing block %lu\n", block);
4802 trace_mark(ext4_free_blocks,
4803 "dev %s block %llu count %lu metadata %d ino %lu",
4804 sb->s_id, (unsigned long long) block, count, metadata,
4805 inode ? inode->i_ino : 0);
4525 4806
4526 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4807 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4527 if (ac) { 4808 if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
4581 err = ext4_journal_get_write_access(handle, gd_bh); 4862 err = ext4_journal_get_write_access(handle, gd_bh);
4582 if (err) 4863 if (err)
4583 goto error_return; 4864 goto error_return;
4584
4585 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4586 if (err)
4587 goto error_return;
4588
4589#ifdef AGGRESSIVE_CHECK 4865#ifdef AGGRESSIVE_CHECK
4590 { 4866 {
4591 int i; 4867 int i;
@@ -4593,13 +4869,6 @@ do_more:
4593 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4869 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4594 } 4870 }
4595#endif 4871#endif
4596 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4597 bit, count);
4598
4599 /* We dirtied the bitmap block */
4600 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4601 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4602
4603 if (ac) { 4872 if (ac) {
4604 ac->ac_b_ex.fe_group = block_group; 4873 ac->ac_b_ex.fe_group = block_group;
4605 ac->ac_b_ex.fe_start = bit; 4874 ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
4607 ext4_mb_store_history(ac); 4876 ext4_mb_store_history(ac);
4608 } 4877 }
4609 4878
4610 if (metadata) { 4879 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4611 /* blocks being freed are metadata. these blocks shouldn't 4880 if (err)
4612 * be used until this transaction is committed */ 4881 goto error_return;
4613 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4882 if (metadata && ext4_handle_valid(handle)) {
4883 struct ext4_free_data *new_entry;
4884 /*
4885 * blocks being freed are metadata. these blocks shouldn't
4886 * be used until this transaction is committed
4887 */
4888 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4889 new_entry->start_blk = bit;
4890 new_entry->group = block_group;
4891 new_entry->count = count;
4892 new_entry->t_tid = handle->h_transaction->t_tid;
4893 ext4_lock_group(sb, block_group);
4894 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4895 bit, count);
4896 ext4_mb_free_metadata(handle, &e4b, new_entry);
4897 ext4_unlock_group(sb, block_group);
4614 } else { 4898 } else {
4615 ext4_lock_group(sb, block_group); 4899 ext4_lock_group(sb, block_group);
4900 /* need to update group_info->bb_free and bitmap
4901 * with group lock held. generate_buddy look at
4902 * them with group lock_held
4903 */
4904 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4905 bit, count);
4616 mb_free_blocks(inode, &e4b, bit, count); 4906 mb_free_blocks(inode, &e4b, bit, count);
4617 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4907 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4618 ext4_unlock_group(sb, block_group); 4908 ext4_unlock_group(sb, block_group);
4619 } 4909 }
4620 4910
4621 spin_lock(sb_bgl_lock(sbi, block_group)); 4911 spin_lock(sb_bgl_lock(sbi, block_group));
4622 le16_add_cpu(&gdp->bg_free_blocks_count, count); 4912 ret = ext4_free_blks_count(sb, gdp) + count;
4913 ext4_free_blks_set(sb, gdp, ret);
4623 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4914 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4624 spin_unlock(sb_bgl_lock(sbi, block_group)); 4915 spin_unlock(sb_bgl_lock(sbi, block_group));
4625 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4916 percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
4635 4926
4636 *freed += count; 4927 *freed += count;
4637 4928
4929 /* We dirtied the bitmap block */
4930 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4931 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4932
4638 /* And the group descriptor block */ 4933 /* And the group descriptor block */
4639 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4934 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4640 ret = ext4_journal_dirty_metadata(handle, gd_bh); 4935 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4641 if (!err) 4936 if (!err)
4642 err = ret; 4937 err = ret;
4643 4938
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h> 22#include <linux/marker.h>
23#include <linux/mutex.h>
23#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
24#include "ext4.h" 25#include "ext4.h"
25#include "group.h" 26#include "group.h"
@@ -98,9 +99,6 @@
98 */ 99 */
99#define MB_DEFAULT_GROUP_PREALLOC 512 100#define MB_DEFAULT_GROUP_PREALLOC 512
100 101
101static struct kmem_cache *ext4_pspace_cachep;
102static struct kmem_cache *ext4_ac_cachep;
103static struct kmem_cache *ext4_free_ext_cachep;
104 102
105struct ext4_free_data { 103struct ext4_free_data {
106 /* this links the free block information from group_info */ 104 /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
120 tid_t t_tid; 118 tid_t t_tid;
121}; 119};
122 120
123struct ext4_group_info {
124 unsigned long bb_state;
125 struct rb_root bb_free_root;
126 unsigned short bb_first_free;
127 unsigned short bb_free;
128 unsigned short bb_fragments;
129 struct list_head bb_prealloc_list;
130#ifdef DOUBLE_CHECK
131 void *bb_bitmap;
132#endif
133 unsigned short bb_counters[];
134};
135
136#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
137#define EXT4_GROUP_INFO_LOCKED_BIT 1
138
139#define EXT4_MB_GRP_NEED_INIT(grp) \
140 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
141
142
143struct ext4_prealloc_space { 121struct ext4_prealloc_space {
144 struct list_head pa_inode_list; 122 struct list_head pa_inode_list;
145 struct list_head pa_group_list; 123 struct list_head pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
217 __u8 ac_op; /* operation, for history only */ 195 __u8 ac_op; /* operation, for history only */
218 struct page *ac_bitmap_page; 196 struct page *ac_bitmap_page;
219 struct page *ac_buddy_page; 197 struct page *ac_buddy_page;
198 /*
199 * pointer to the held semaphore upon successful
200 * block allocation
201 */
202 struct rw_semaphore *alloc_semp;
220 struct ext4_prealloc_space *ac_pa; 203 struct ext4_prealloc_space *ac_pa;
221 struct ext4_locality_group *ac_lg; 204 struct ext4_locality_group *ac_lg;
222}; 205};
@@ -250,6 +233,7 @@ struct ext4_buddy {
250 struct super_block *bd_sb; 233 struct super_block *bd_sb;
251 __u16 bd_blkbits; 234 __u16 bd_blkbits;
252 ext4_group_t bd_group; 235 ext4_group_t bd_group;
236 struct rw_semaphore *alloc_semp;
253}; 237};
254#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 238#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
255#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 239#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
259{ 243{
260 return; 244 return;
261} 245}
262#else
263static void ext4_mb_store_history(struct ext4_allocation_context *ac);
264#endif 246#endif
265 247
266#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
267 249
268struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
269 251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
270static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
271 ext4_group_t group);
272static void ext4_mb_return_to_preallocation(struct inode *inode,
273 struct ext4_buddy *e4b, sector_t block,
274 int count);
275static void ext4_mb_put_pa(struct ext4_allocation_context *,
276 struct super_block *, struct ext4_prealloc_space *pa);
277static int ext4_mb_init_per_dev_proc(struct super_block *sb);
278static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
279static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
280
281
282static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
283{
284 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
285
286 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
287}
288
289static inline void ext4_unlock_group(struct super_block *sb,
290 ext4_group_t group)
291{
292 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
293
294 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
295}
296
297static inline int ext4_is_group_locked(struct super_block *sb,
298 ext4_group_t group)
299{
300 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
301
302 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
303 &(grinfo->bb_state));
304}
305
306static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
307 struct ext4_free_extent *fex) 252 struct ext4_free_extent *fex)
308{ 253{
309 ext4_fsblk_t block; 254 ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
59 /* 59 /*
60 * Make sure the credit we accumalated is not really high 60 * Make sure the credit we accumalated is not really high
61 */ 61 */
62 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { 62 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) {
63 retval = ext4_journal_restart(handle, needed); 64 retval = ext4_journal_restart(handle, needed);
64 if (retval) 65 if (retval)
65 goto err_out; 66 goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
229{ 230{
230 int retval = 0, needed; 231 int retval = 0, needed;
231 232
232 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 233 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
233 return 0; 234 return 0;
234 /* 235 /*
235 * We are freeing a blocks. During this we touch 236 * We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
458 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
459 unsigned long max_entries; 460 unsigned long max_entries;
460 461
461 if (!test_opt(inode->i_sb, EXTENTS)) 462 /*
462 /* 463 * If the filesystem does not support extents, or the inode
463 * if mounted with noextents we don't allow the migrate 464 * already is extent-based, error out.
464 */ 465 */
465 return -EINVAL; 466 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
466 467 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
467 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 468 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
468 return -EINVAL; 469 return -EINVAL;
469 470
470 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 471 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9fd2a5e1be4d..fec0b4c2f5f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
372 goto fail; 368 goto fail;
373 } 369 }
374 hinfo->hash_version = root->info.hash_version; 370 hinfo->hash_version = root->info.hash_version;
371 if (hinfo->hash_version <= DX_HASH_TEA)
372 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 373 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
376 if (d_name) 374 if (d_name)
377 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 375 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
641 dir = dir_file->f_path.dentry->d_inode; 639 dir = dir_file->f_path.dentry->d_inode;
642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 640 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 641 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
642 if (hinfo.hash_version <= DX_HASH_TEA)
643 hinfo.hash_version +=
644 EXT4_SB(dir->i_sb)->s_hash_unsigned;
644 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 645 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
645 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 646 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
646 start_hash, start_minor_hash); 647 start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
806static inline int search_dirblock(struct buffer_head *bh, 807static inline int search_dirblock(struct buffer_head *bh,
807 struct inode *dir, 808 struct inode *dir,
808 const struct qstr *d_name, 809 const struct qstr *d_name,
809 unsigned long offset, 810 unsigned int offset,
810 struct ext4_dir_entry_2 ** res_dir) 811 struct ext4_dir_entry_2 ** res_dir)
811{ 812{
812 struct ext4_dir_entry_2 * de; 813 struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1043 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1044 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1044 inode = NULL; 1045 inode = NULL;
1045 if (bh) { 1046 if (bh) {
1046 unsigned long ino = le32_to_cpu(de->inode); 1047 __u32 ino = le32_to_cpu(de->inode);
1047 brelse(bh); 1048 brelse(bh);
1048 if (!ext4_valid_inum(dir->i_sb, ino)) { 1049 if (!ext4_valid_inum(dir->i_sb, ino)) {
1049 ext4_error(dir->i_sb, "ext4_lookup", 1050 ext4_error(dir->i_sb, "ext4_lookup",
1050 "bad inode number: %lu", ino); 1051 "bad inode number: %u", ino);
1051 return ERR_PTR(-EIO); 1052 return ERR_PTR(-EIO);
1052 } 1053 }
1053 inode = ext4_iget(dir->i_sb, ino); 1054 inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1060 1061
1061struct dentry *ext4_get_parent(struct dentry *child) 1062struct dentry *ext4_get_parent(struct dentry *child)
1062{ 1063{
1063 unsigned long ino; 1064 __u32 ino;
1064 struct inode *inode; 1065 struct inode *inode;
1065 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1066 .name = "..", 1067 .name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1078 1079
1079 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1080 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1080 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1081 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1081 "bad inode number: %lu", ino); 1082 "bad inode number: %u", ino);
1082 return ERR_PTR(-EIO); 1083 return ERR_PTR(-EIO);
1083 } 1084 }
1084 1085
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1166 u32 hash2; 1167 u32 hash2;
1167 struct dx_map_entry *map; 1168 struct dx_map_entry *map;
1168 char *data1 = (*bh)->b_data, *data2; 1169 char *data1 = (*bh)->b_data, *data2;
1169 unsigned split, move, size, i; 1170 unsigned split, move, size;
1170 struct ext4_dir_entry_2 *de = NULL, *de2; 1171 struct ext4_dir_entry_2 *de = NULL, *de2;
1171 int err = 0; 1172 int err = 0, i;
1172 1173
1173 bh2 = ext4_append (handle, dir, &newblock, &err); 1174 bh2 = ext4_append (handle, dir, &newblock, &err);
1174 if (!(bh2)) { 1175 if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1228 de = de2; 1229 de = de2;
1229 } 1230 }
1230 dx_insert_block(frame, hash2 + continued, newblock); 1231 dx_insert_block(frame, hash2 + continued, newblock);
1231 err = ext4_journal_dirty_metadata(handle, bh2); 1232 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1232 if (err) 1233 if (err)
1233 goto journal_error; 1234 goto journal_error;
1234 err = ext4_journal_dirty_metadata(handle, frame->bh); 1235 err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
1235 if (err) 1236 if (err)
1236 goto journal_error; 1237 goto journal_error;
1237 brelse(bh2); 1238 brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1266 struct inode *dir = dentry->d_parent->d_inode; 1267 struct inode *dir = dentry->d_parent->d_inode;
1267 const char *name = dentry->d_name.name; 1268 const char *name = dentry->d_name.name;
1268 int namelen = dentry->d_name.len; 1269 int namelen = dentry->d_name.len;
1269 unsigned long offset = 0; 1270 unsigned int offset = 0;
1270 unsigned short reclen; 1271 unsigned short reclen;
1271 int nlen, rlen, err; 1272 int nlen, rlen, err;
1272 char *top; 1273 char *top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1335 ext4_update_dx_flag(dir); 1336 ext4_update_dx_flag(dir);
1336 dir->i_version++; 1337 dir->i_version++;
1337 ext4_mark_inode_dirty(handle, dir); 1338 ext4_mark_inode_dirty(handle, dir);
1338 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1339 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1339 err = ext4_journal_dirty_metadata(handle, bh); 1340 err = ext4_handle_dirty_metadata(handle, dir, bh);
1340 if (err) 1341 if (err)
1341 ext4_std_error(dir->i_sb, err); 1342 ext4_std_error(dir->i_sb, err);
1342 brelse(bh); 1343 brelse(bh);
@@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1408 1409
1409 /* Initialize as for dx_probe */ 1410 /* Initialize as for dx_probe */
1410 hinfo.hash_version = root->info.hash_version; 1411 hinfo.hash_version = root->info.hash_version;
1412 if (hinfo.hash_version <= DX_HASH_TEA)
1413 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1411 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 1414 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1412 ext4fs_dirhash(name, namelen, &hinfo); 1415 ext4fs_dirhash(name, namelen, &hinfo);
1413 frame = frames; 1416 frame = frames;
@@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1437 struct inode *inode) 1440 struct inode *inode)
1438{ 1441{
1439 struct inode *dir = dentry->d_parent->d_inode; 1442 struct inode *dir = dentry->d_parent->d_inode;
1440 unsigned long offset;
1441 struct buffer_head *bh; 1443 struct buffer_head *bh;
1442 struct ext4_dir_entry_2 *de; 1444 struct ext4_dir_entry_2 *de;
1443 struct super_block *sb; 1445 struct super_block *sb;
@@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1459 ext4_mark_inode_dirty(handle, dir); 1461 ext4_mark_inode_dirty(handle, dir);
1460 } 1462 }
1461 blocks = dir->i_size >> sb->s_blocksize_bits; 1463 blocks = dir->i_size >> sb->s_blocksize_bits;
1462 for (block = 0, offset = 0; block < blocks; block++) { 1464 for (block = 0; block < blocks; block++) {
1463 bh = ext4_bread(handle, dir, block, 0, &retval); 1465 bh = ext4_bread(handle, dir, block, 0, &retval);
1464 if(!bh) 1466 if(!bh)
1465 return retval; 1467 return retval;
@@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1574 dxtrace(dx_show_index("node", frames[1].entries)); 1576 dxtrace(dx_show_index("node", frames[1].entries));
1575 dxtrace(dx_show_index("node", 1577 dxtrace(dx_show_index("node",
1576 ((struct dx_node *) bh2->b_data)->entries)); 1578 ((struct dx_node *) bh2->b_data)->entries));
1577 err = ext4_journal_dirty_metadata(handle, bh2); 1579 err = ext4_handle_dirty_metadata(handle, inode, bh2);
1578 if (err) 1580 if (err)
1579 goto journal_error; 1581 goto journal_error;
1580 brelse (bh2); 1582 brelse (bh2);
@@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1600 if (err) 1602 if (err)
1601 goto journal_error; 1603 goto journal_error;
1602 } 1604 }
1603 ext4_journal_dirty_metadata(handle, frames[0].bh); 1605 ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1604 } 1606 }
1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1607 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1606 if (!de) 1608 if (!de)
@@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
1646 else 1648 else
1647 de->inode = 0; 1649 de->inode = 0;
1648 dir->i_version++; 1650 dir->i_version++;
1649 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1651 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1650 ext4_journal_dirty_metadata(handle, bh); 1652 ext4_handle_dirty_metadata(handle, dir, bh);
1651 return 0; 1653 return 0;
1652 } 1654 }
1653 i += ext4_rec_len_from_disk(de->rec_len); 1655 i += ext4_rec_len_from_disk(de->rec_len);
@@ -1725,7 +1727,7 @@ retry:
1725 return PTR_ERR(handle); 1727 return PTR_ERR(handle);
1726 1728
1727 if (IS_DIRSYNC(dir)) 1729 if (IS_DIRSYNC(dir))
1728 handle->h_sync = 1; 1730 ext4_handle_sync(handle);
1729 1731
1730 inode = ext4_new_inode (handle, dir, mode); 1732 inode = ext4_new_inode (handle, dir, mode);
1731 err = PTR_ERR(inode); 1733 err = PTR_ERR(inode);
@@ -1759,7 +1761,7 @@ retry:
1759 return PTR_ERR(handle); 1761 return PTR_ERR(handle);
1760 1762
1761 if (IS_DIRSYNC(dir)) 1763 if (IS_DIRSYNC(dir))
1762 handle->h_sync = 1; 1764 ext4_handle_sync(handle);
1763 1765
1764 inode = ext4_new_inode(handle, dir, mode); 1766 inode = ext4_new_inode(handle, dir, mode);
1765 err = PTR_ERR(inode); 1767 err = PTR_ERR(inode);
@@ -1795,7 +1797,7 @@ retry:
1795 return PTR_ERR(handle); 1797 return PTR_ERR(handle);
1796 1798
1797 if (IS_DIRSYNC(dir)) 1799 if (IS_DIRSYNC(dir))
1798 handle->h_sync = 1; 1800 ext4_handle_sync(handle);
1799 1801
1800 inode = ext4_new_inode(handle, dir, S_IFDIR | mode); 1802 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1801 err = PTR_ERR(inode); 1803 err = PTR_ERR(inode);
@@ -1824,8 +1826,8 @@ retry:
1824 strcpy(de->name, ".."); 1826 strcpy(de->name, "..");
1825 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1827 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1826 inode->i_nlink = 2; 1828 inode->i_nlink = 2;
1827 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1829 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1828 ext4_journal_dirty_metadata(handle, dir_block); 1830 ext4_handle_dirty_metadata(handle, dir, dir_block);
1829 brelse(dir_block); 1831 brelse(dir_block);
1830 ext4_mark_inode_dirty(handle, inode); 1832 ext4_mark_inode_dirty(handle, inode);
1831 err = ext4_add_entry(handle, dentry, inode); 1833 err = ext4_add_entry(handle, dentry, inode);
@@ -1854,7 +1856,7 @@ out_stop:
1854 */ 1856 */
1855static int empty_dir(struct inode *inode) 1857static int empty_dir(struct inode *inode)
1856{ 1858{
1857 unsigned long offset; 1859 unsigned int offset;
1858 struct buffer_head *bh; 1860 struct buffer_head *bh;
1859 struct ext4_dir_entry_2 *de, *de1; 1861 struct ext4_dir_entry_2 *de, *de1;
1860 struct super_block *sb; 1862 struct super_block *sb;
@@ -1899,7 +1901,7 @@ static int empty_dir(struct inode *inode)
1899 if (err) 1901 if (err)
1900 ext4_error(sb, __func__, 1902 ext4_error(sb, __func__,
1901 "error %d reading directory" 1903 "error %d reading directory"
1902 " #%lu offset %lu", 1904 " #%lu offset %u",
1903 err, inode->i_ino, offset); 1905 err, inode->i_ino, offset);
1904 offset += sb->s_blocksize; 1906 offset += sb->s_blocksize;
1905 continue; 1907 continue;
@@ -1937,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1937 struct ext4_iloc iloc; 1939 struct ext4_iloc iloc;
1938 int err = 0, rc; 1940 int err = 0, rc;
1939 1941
1942 if (!ext4_handle_valid(handle))
1943 return 0;
1944
1940 lock_super(sb); 1945 lock_super(sb);
1941 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1946 if (!list_empty(&EXT4_I(inode)->i_orphan))
1942 goto out_unlock; 1947 goto out_unlock;
@@ -1965,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1965 /* Insert this inode at the head of the on-disk orphan list... */ 1970 /* Insert this inode at the head of the on-disk orphan list... */
1966 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 1971 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1967 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 1972 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1968 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1973 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
1969 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 1974 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1970 if (!err) 1975 if (!err)
1971 err = rc; 1976 err = rc;
@@ -1999,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
1999 struct list_head *prev; 2004 struct list_head *prev;
2000 struct ext4_inode_info *ei = EXT4_I(inode); 2005 struct ext4_inode_info *ei = EXT4_I(inode);
2001 struct ext4_sb_info *sbi; 2006 struct ext4_sb_info *sbi;
2002 unsigned long ino_next; 2007 __u32 ino_next;
2003 struct ext4_iloc iloc; 2008 struct ext4_iloc iloc;
2004 int err = 0; 2009 int err = 0;
2005 2010
2011 if (!ext4_handle_valid(handle))
2012 return 0;
2013
2006 lock_super(inode->i_sb); 2014 lock_super(inode->i_sb);
2007 if (list_empty(&ei->i_orphan)) { 2015 if (list_empty(&ei->i_orphan)) {
2008 unlock_super(inode->i_sb); 2016 unlock_super(inode->i_sb);
@@ -2021,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2021 * transaction handle with which to update the orphan list on 2029 * transaction handle with which to update the orphan list on
2022 * disk, but we still need to remove the inode from the linked 2030 * disk, but we still need to remove the inode from the linked
2023 * list in memory. */ 2031 * list in memory. */
2024 if (!handle) 2032 if (sbi->s_journal && !handle)
2025 goto out; 2033 goto out;
2026 2034
2027 err = ext4_reserve_inode_write(handle, inode, &iloc); 2035 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2029,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2029 goto out_err; 2037 goto out_err;
2030 2038
2031 if (prev == &sbi->s_orphan) { 2039 if (prev == &sbi->s_orphan) {
2032 jbd_debug(4, "superblock will point to %lu\n", ino_next); 2040 jbd_debug(4, "superblock will point to %u\n", ino_next);
2033 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 2041 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2034 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 2042 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2035 if (err) 2043 if (err)
2036 goto out_brelse; 2044 goto out_brelse;
2037 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2045 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2038 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); 2046 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
2039 } else { 2047 } else {
2040 struct ext4_iloc iloc2; 2048 struct ext4_iloc iloc2;
2041 struct inode *i_prev = 2049 struct inode *i_prev =
2042 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; 2050 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2043 2051
2044 jbd_debug(4, "orphan inode %lu will point to %lu\n", 2052 jbd_debug(4, "orphan inode %lu will point to %u\n",
2045 i_prev->i_ino, ino_next); 2053 i_prev->i_ino, ino_next);
2046 err = ext4_reserve_inode_write(handle, i_prev, &iloc2); 2054 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2047 if (err) 2055 if (err)
@@ -2086,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2086 goto end_rmdir; 2094 goto end_rmdir;
2087 2095
2088 if (IS_DIRSYNC(dir)) 2096 if (IS_DIRSYNC(dir))
2089 handle->h_sync = 1; 2097 ext4_handle_sync(handle);
2090 2098
2091 inode = dentry->d_inode; 2099 inode = dentry->d_inode;
2092 2100
@@ -2140,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2140 return PTR_ERR(handle); 2148 return PTR_ERR(handle);
2141 2149
2142 if (IS_DIRSYNC(dir)) 2150 if (IS_DIRSYNC(dir))
2143 handle->h_sync = 1; 2151 ext4_handle_sync(handle);
2144 2152
2145 retval = -ENOENT; 2153 retval = -ENOENT;
2146 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2154 bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2197,7 +2205,7 @@ retry:
2197 return PTR_ERR(handle); 2205 return PTR_ERR(handle);
2198 2206
2199 if (IS_DIRSYNC(dir)) 2207 if (IS_DIRSYNC(dir))
2200 handle->h_sync = 1; 2208 ext4_handle_sync(handle);
2201 2209
2202 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); 2210 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2203 err = PTR_ERR(inode); 2211 err = PTR_ERR(inode);
@@ -2260,7 +2268,7 @@ retry:
2260 return PTR_ERR(handle); 2268 return PTR_ERR(handle);
2261 2269
2262 if (IS_DIRSYNC(dir)) 2270 if (IS_DIRSYNC(dir))
2263 handle->h_sync = 1; 2271 ext4_handle_sync(handle);
2264 2272
2265 inode->i_ctime = ext4_current_time(inode); 2273 inode->i_ctime = ext4_current_time(inode);
2266 ext4_inc_count(handle, inode); 2274 ext4_inc_count(handle, inode);
@@ -2309,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2309 return PTR_ERR(handle); 2317 return PTR_ERR(handle);
2310 2318
2311 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2319 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2312 handle->h_sync = 1; 2320 ext4_handle_sync(handle);
2313 2321
2314 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 2322 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2315 /* 2323 /*
@@ -2363,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2363 new_dir->i_ctime = new_dir->i_mtime = 2371 new_dir->i_ctime = new_dir->i_mtime =
2364 ext4_current_time(new_dir); 2372 ext4_current_time(new_dir);
2365 ext4_mark_inode_dirty(handle, new_dir); 2373 ext4_mark_inode_dirty(handle, new_dir);
2366 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); 2374 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2367 ext4_journal_dirty_metadata(handle, new_bh); 2375 ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2368 brelse(new_bh); 2376 brelse(new_bh);
2369 new_bh = NULL; 2377 new_bh = NULL;
2370 } 2378 }
@@ -2414,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2414 BUFFER_TRACE(dir_bh, "get_write_access"); 2422 BUFFER_TRACE(dir_bh, "get_write_access");
2415 ext4_journal_get_write_access(handle, dir_bh); 2423 ext4_journal_get_write_access(handle, dir_bh);
2416 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2424 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2417 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); 2425 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2418 ext4_journal_dirty_metadata(handle, dir_bh); 2426 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2419 ext4_dec_count(handle, old_dir); 2427 ext4_dec_count(handle, old_dir);
2420 if (new_inode) { 2428 if (new_inode) {
2421 /* checked empty_dir above, can't have another parent, 2429 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c328be5d6885 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
50 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 50 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
51 if (group != sbi->s_groups_count) 51 if (group != sbi->s_groups_count)
52 ext4_warning(sb, __func__, 52 ext4_warning(sb, __func__,
53 "Cannot add at group %u (only %lu groups)", 53 "Cannot add at group %u (only %u groups)",
54 input->group, sbi->s_groups_count); 54 input->group, sbi->s_groups_count);
55 else if (offset != 0) 55 else if (offset != 0)
56 ext4_warning(sb, __func__, "Last group not full"); 56 ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
149{ 149{
150 int err; 150 int err;
151 151
152 if (handle->h_buffer_credits >= thresh) 152 if (ext4_handle_has_enough_credits(handle, thresh))
153 return 0; 153 return 0;
154 154
155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); 155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
233 set_buffer_uptodate(gdb); 233 set_buffer_uptodate(gdb);
234 unlock_buffer(gdb); 234 unlock_buffer(gdb);
235 ext4_journal_dirty_metadata(handle, gdb); 235 ext4_handle_dirty_metadata(handle, NULL, gdb);
236 ext4_set_bit(bit, bh->b_data); 236 ext4_set_bit(bit, bh->b_data);
237 brelse(gdb); 237 brelse(gdb);
238 } 238 }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
251 err = PTR_ERR(bh); 251 err = PTR_ERR(bh);
252 goto exit_bh; 252 goto exit_bh;
253 } 253 }
254 ext4_journal_dirty_metadata(handle, gdb); 254 ext4_handle_dirty_metadata(handle, NULL, gdb);
255 ext4_set_bit(bit, bh->b_data); 255 ext4_set_bit(bit, bh->b_data);
256 brelse(gdb); 256 brelse(gdb);
257 } 257 }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
276 err = PTR_ERR(it); 276 err = PTR_ERR(it);
277 goto exit_bh; 277 goto exit_bh;
278 } 278 }
279 ext4_journal_dirty_metadata(handle, it); 279 ext4_handle_dirty_metadata(handle, NULL, it);
280 brelse(it); 280 brelse(it);
281 ext4_set_bit(bit, bh->b_data); 281 ext4_set_bit(bit, bh->b_data);
282 } 282 }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
284 if ((err = extend_or_restart_transaction(handle, 2, bh))) 284 if ((err = extend_or_restart_transaction(handle, 2, bh)))
285 goto exit_bh; 285 goto exit_bh;
286 286
287 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), 287 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
288 bh->b_data); 288 ext4_handle_dirty_metadata(handle, NULL, bh);
289 ext4_journal_dirty_metadata(handle, bh);
290 brelse(bh); 289 brelse(bh);
291
292 /* Mark unused entries in inode bitmap used */ 290 /* Mark unused entries in inode bitmap used */
293 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 291 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
294 input->inode_bitmap, input->inode_bitmap - start); 292 input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
297 goto exit_journal; 295 goto exit_journal;
298 } 296 }
299 297
300 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 298 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
301 bh->b_data); 299 bh->b_data);
302 ext4_journal_dirty_metadata(handle, bh); 300 ext4_handle_dirty_metadata(handle, NULL, bh);
303exit_bh: 301exit_bh:
304 brelse(bh); 302 brelse(bh);
305 303
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 * reserved inode, and will become GDT blocks (primary and backup). 484 * reserved inode, and will become GDT blocks (primary and backup).
487 */ 485 */
488 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 486 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
489 ext4_journal_dirty_metadata(handle, dind); 487 ext4_handle_dirty_metadata(handle, NULL, dind);
490 brelse(dind); 488 brelse(dind);
491 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 489 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
492 ext4_mark_iloc_dirty(handle, inode, &iloc); 490 ext4_mark_iloc_dirty(handle, inode, &iloc);
493 memset((*primary)->b_data, 0, sb->s_blocksize); 491 memset((*primary)->b_data, 0, sb->s_blocksize);
494 ext4_journal_dirty_metadata(handle, *primary); 492 ext4_handle_dirty_metadata(handle, NULL, *primary);
495 493
496 o_group_desc = EXT4_SB(sb)->s_group_desc; 494 o_group_desc = EXT4_SB(sb)->s_group_desc;
497 memcpy(n_group_desc, o_group_desc, 495 memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
502 kfree(o_group_desc); 500 kfree(o_group_desc);
503 501
504 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 502 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
505 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 503 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
506 504
507 return 0; 505 return 0;
508 506
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
618 primary[i]->b_blocknr, gdbackups, 616 primary[i]->b_blocknr, gdbackups,
619 blk + primary[i]->b_blocknr); */ 617 blk + primary[i]->b_blocknr); */
620 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); 618 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
621 err2 = ext4_journal_dirty_metadata(handle, primary[i]); 619 err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
622 if (!err) 620 if (!err)
623 err = err2; 621 err = err2;
624 } 622 }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
676 struct buffer_head *bh; 674 struct buffer_head *bh;
677 675
678 /* Out of journal space, and can't get more - abort - so sad */ 676 /* Out of journal space, and can't get more - abort - so sad */
679 if (handle->h_buffer_credits == 0 && 677 if (ext4_handle_valid(handle) &&
678 handle->h_buffer_credits == 0 &&
680 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && 679 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
681 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 680 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
682 break; 681 break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
696 memset(bh->b_data + size, 0, rest); 695 memset(bh->b_data + size, 0, rest);
697 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
698 unlock_buffer(bh); 697 unlock_buffer(bh);
699 ext4_journal_dirty_metadata(handle, bh); 698 ext4_handle_dirty_metadata(handle, NULL, bh);
700 brelse(bh); 699 brelse(bh);
701 } 700 }
702 if ((err2 = ext4_journal_stop(handle)) && !err) 701 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
715exit_err: 714exit_err:
716 if (err) { 715 if (err) {
717 ext4_warning(sb, __func__, 716 ext4_warning(sb, __func__,
718 "can't update backup for group %lu (err %d), " 717 "can't update backup for group %u (err %d), "
719 "forcing fsck on next reboot", group, err); 718 "forcing fsck on next reboot", group, err);
720 sbi->s_mount_state &= ~EXT4_VALID_FS; 719 sbi->s_mount_state &= ~EXT4_VALID_FS;
721 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
747 struct inode *inode = NULL; 746 struct inode *inode = NULL;
748 handle_t *handle; 747 handle_t *handle;
749 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 750 int err, err2;
751 751
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
761 761
762 if (ext4_blocks_count(es) + input->blocks_count < 762 if (ext4_blocks_count(es) + input->blocks_count <
763 ext4_blocks_count(es)) { 763 ext4_blocks_count(es)) {
764 ext4_warning(sb, __func__, "blocks_count overflow\n"); 764 ext4_warning(sb, __func__, "blocks_count overflow");
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
769 le32_to_cpu(es->s_inodes_count)) { 769 le32_to_cpu(es->s_inodes_count)) {
770 ext4_warning(sb, __func__, "inodes_count overflow\n"); 770 ext4_warning(sb, __func__, "inodes_count overflow");
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 773
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
787 } 787 }
788 } 788 }
789 789
790
790 if ((err = verify_group_input(sb, input))) 791 if ((err = verify_group_input(sb, input)))
791 goto exit_put; 792 goto exit_put;
792 793
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
855 * using the new disk blocks. 856 * using the new disk blocks.
856 */ 857 */
857 858
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
858 /* Update group descriptor block for new group */ 860 /* Update group descriptor block for new group */
859 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
860 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
862 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 864 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
863 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 865 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
864 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 866 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
865 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); 867 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
866 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); 868 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
869 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
867 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 870 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
868 871
869 /* 872 /*
870 * We can allocate memory for mb_alloc based on the new group 873 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 874 * descriptor
872 */ 875 */
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 876 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
874 if (err) 877 if (err) {
878 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
875 goto exit_journal; 879 goto exit_journal;
880 }
876 881
877 /* 882 /*
878 * Make the new blocks and inodes valid next. We do this before 883 * Make the new blocks and inodes valid next. We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
914 919
915 /* Update the global fs size fields */ 920 /* Update the global fs size fields */
916 sbi->s_groups_count++; 921 sbi->s_groups_count++;
922 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
917 923
918 ext4_journal_dirty_metadata(handle, primary); 924 ext4_handle_dirty_metadata(handle, NULL, primary);
919 925
920 /* Update the reserved block counts only once the new group is 926 /* Update the reserved block counts only once the new group is
921 * active. */ 927 * active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 EXT4_INODES_PER_GROUP(sb); 943 EXT4_INODES_PER_GROUP(sb);
938 } 944 }
939 945
940 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 946 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
941 sb->s_dirt = 1; 947 sb->s_dirt = 1;
942 948
943exit_journal: 949exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
975 struct buffer_head *bh; 981 struct buffer_head *bh;
976 handle_t *handle; 982 handle_t *handle;
977 int err; 983 int err;
978 unsigned long freed_blocks;
979 ext4_group_t group; 984 ext4_group_t group;
980 struct ext4_group_info *grp;
981 985
982 /* We don't need to worry about locking wrt other resizers just 986 /* We don't need to worry about locking wrt other resizers just
983 * yet: we're going to revalidate es->s_blocks_count after 987 * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 1001 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 1002 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 1003 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, 1004 ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
1001 "CONFIG_LBD not enabled\n");
1002 return -EINVAL; 1005 return -EINVAL;
1003 } 1006 }
1004 1007
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1071 goto exit_put; 1074 goto exit_put;
1072 } 1075 }
1073 ext4_blocks_count_set(es, o_blocks_count + add); 1076 ext4_blocks_count_set(es, o_blocks_count + add);
1074 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1077 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1075 sb->s_dirt = 1; 1078 sb->s_dirt = 1;
1076 unlock_super(sb); 1079 unlock_super(sb);
1077 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1080 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1078 o_blocks_count + add); 1081 o_blocks_count + add);
1079 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1082 /* We add the blocks to the bitmap and set the group need init bit */
1083 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1080 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1084 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1081 o_blocks_count + add); 1085 o_blocks_count + add);
1082 if ((err = ext4_journal_stop(handle))) 1086 if ((err = ext4_journal_stop(handle)))
1083 goto exit_put; 1087 goto exit_put;
1084 1088
1085 /*
1086 * Mark mballoc pages as not up to date so that they will be updated
1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1090 * Uptodate flag, particularly on thte bitmap bh, as way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * overloaded. A user could take a LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1095 */
1096 {
1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 struct inode *inode = sbi->s_buddy_cache;
1099 int blocks_per_page;
1100 int block;
1101 int pnum;
1102 struct page *page;
1103
1104 /* Set buddy page as not up to date */
1105 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1106 block = group * 2;
1107 pnum = block / blocks_per_page;
1108 page = find_get_page(inode->i_mapping, pnum);
1109 if (page != NULL) {
1110 ClearPageUptodate(page);
1111 page_cache_release(page);
1112 }
1113
1114 /* Set bitmap page as not up to date */
1115 block++;
1116 pnum = block / blocks_per_page;
1117 page = find_get_page(inode->i_mapping, pnum);
1118 if (page != NULL) {
1119 ClearPageUptodate(page);
1120 page_cache_release(page);
1121 }
1122
1123 /* Get the info on the last group */
1124 grp = ext4_get_group_info(sb, group);
1125
1126 /* Update free blocks in group info */
1127 ext4_mb_update_group_info(grp, add);
1128 }
1129
1130 if (test_opt(sb, DEBUG)) 1089 if (test_opt(sb, DEBUG))
1131 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1090 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1132 ext4_blocks_count(es)); 1091 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9494bb249390..e5f06a5f045e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root;
51 51
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 53 unsigned long journal_devnum);
54static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_commit_super(struct super_block *sb,
55 unsigned int);
56static void ext4_commit_super(struct super_block *sb,
57 struct ext4_super_block *es, int sync); 55 struct ext4_super_block *es, int sync);
58static void ext4_mark_recovery_complete(struct super_block *sb, 56static void ext4_mark_recovery_complete(struct super_block *sb,
59 struct ext4_super_block *es); 57 struct ext4_super_block *es);
@@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
64 char nbuf[16]); 62 char nbuf[16]);
65static int ext4_remount(struct super_block *sb, int *flags, char *data); 63static int ext4_remount(struct super_block *sb, int *flags, char *data);
66static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
67static void ext4_unlockfs(struct super_block *sb); 65static int ext4_unfreeze(struct super_block *sb);
68static void ext4_write_super(struct super_block *sb); 66static void ext4_write_super(struct super_block *sb);
69static void ext4_write_super_lockfs(struct super_block *sb); 67static int ext4_freeze(struct super_block *sb);
70 68
71 69
72ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 70ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
93 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 91 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94} 92}
95 93
94__u32 ext4_free_blks_count(struct super_block *sb,
95 struct ext4_group_desc *bg)
96{
97 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
98 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
99 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
100}
101
102__u32 ext4_free_inodes_count(struct super_block *sb,
103 struct ext4_group_desc *bg)
104{
105 return le16_to_cpu(bg->bg_free_inodes_count_lo) |
106 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
107 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
108}
109
110__u32 ext4_used_dirs_count(struct super_block *sb,
111 struct ext4_group_desc *bg)
112{
113 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
114 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
115 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
116}
117
118__u32 ext4_itable_unused_count(struct super_block *sb,
119 struct ext4_group_desc *bg)
120{
121 return le16_to_cpu(bg->bg_itable_unused_lo) |
122 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
123 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
124}
125
96void ext4_block_bitmap_set(struct super_block *sb, 126void ext4_block_bitmap_set(struct super_block *sb,
97 struct ext4_group_desc *bg, ext4_fsblk_t blk) 127 struct ext4_group_desc *bg, ext4_fsblk_t blk)
98{ 128{
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
117 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 147 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
118} 148}
119 149
150void ext4_free_blks_set(struct super_block *sb,
151 struct ext4_group_desc *bg, __u32 count)
152{
153 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
154 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
155 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
156}
157
158void ext4_free_inodes_set(struct super_block *sb,
159 struct ext4_group_desc *bg, __u32 count)
160{
161 bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
162 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
163 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
164}
165
166void ext4_used_dirs_set(struct super_block *sb,
167 struct ext4_group_desc *bg, __u32 count)
168{
169 bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
170 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
171 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
172}
173
174void ext4_itable_unused_set(struct super_block *sb,
175 struct ext4_group_desc *bg, __u32 count)
176{
177 bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
178 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
179 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
180}
181
120/* 182/*
121 * Wrappers for jbd2_journal_start/end. 183 * Wrappers for jbd2_journal_start/end.
122 * 184 *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
136 * backs (eg. EIO in the commit thread), then we still need to 198 * backs (eg. EIO in the commit thread), then we still need to
137 * take the FS itself readonly cleanly. */ 199 * take the FS itself readonly cleanly. */
138 journal = EXT4_SB(sb)->s_journal; 200 journal = EXT4_SB(sb)->s_journal;
139 if (is_journal_aborted(journal)) { 201 if (journal) {
140 ext4_abort(sb, __func__, 202 if (is_journal_aborted(journal)) {
141 "Detected aborted journal"); 203 ext4_abort(sb, __func__,
142 return ERR_PTR(-EROFS); 204 "Detected aborted journal");
205 return ERR_PTR(-EROFS);
206 }
207 return jbd2_journal_start(journal, nblocks);
143 } 208 }
144 209 /*
145 return jbd2_journal_start(journal, nblocks); 210 * We're not journaling, return the appropriate indication.
211 */
212 current->journal_info = EXT4_NOJOURNAL_HANDLE;
213 return current->journal_info;
146} 214}
147 215
148/* 216/*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
157 int err; 225 int err;
158 int rc; 226 int rc;
159 227
228 if (!ext4_handle_valid(handle)) {
229 /*
230 * Do this here since we don't call jbd2_journal_stop() in
231 * no-journal mode.
232 */
233 current->journal_info = NULL;
234 return 0;
235 }
160 sb = handle->h_transaction->t_journal->j_private; 236 sb = handle->h_transaction->t_journal->j_private;
161 err = handle->h_err; 237 err = handle->h_err;
162 rc = jbd2_journal_stop(handle); 238 rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
174 char nbuf[16]; 250 char nbuf[16];
175 const char *errstr = ext4_decode_error(NULL, err, nbuf); 251 const char *errstr = ext4_decode_error(NULL, err, nbuf);
176 252
253 BUG_ON(!ext4_handle_valid(handle));
254
177 if (bh) 255 if (bh)
178 BUFFER_TRACE(bh, "abort"); 256 BUFFER_TRACE(bh, "abort");
179 257
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
350 va_end(args); 428 va_end(args);
351} 429}
352 430
431void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
432 const char *function, const char *fmt, ...)
433__releases(bitlock)
434__acquires(bitlock)
435{
436 va_list args;
437 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
438
439 va_start(args, fmt);
440 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
441 vprintk(fmt, args);
442 printk("\n");
443 va_end(args);
444
445 if (test_opt(sb, ERRORS_CONT)) {
446 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
447 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
448 ext4_commit_super(sb, es, 0);
449 return;
450 }
451 ext4_unlock_group(sb, grp);
452 ext4_handle_error(sb);
453 /*
454 * We only get here in the ERRORS_RO case; relocking the group
455 * may be dangerous, but nothing bad will happen since the
456 * filesystem will have already been marked read/only and the
457 * journal has been aborted. We return 1 as a hint to callers
458 * who might what to use the return value from
459 * ext4_grp_locked_error() to distinguish beween the
460 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
461 * aggressively from the ext4 function in question, with a
462 * more appropriate error code.
463 */
464 ext4_lock_group(sb, grp);
465 return;
466}
467
468
353void ext4_update_dynamic_rev(struct super_block *sb) 469void ext4_update_dynamic_rev(struct super_block *sb)
354{ 470{
355 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 471 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
389 return bdev; 505 return bdev;
390 506
391fail: 507fail:
392 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", 508 printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
393 __bdevname(dev, b), PTR_ERR(bdev)); 509 __bdevname(dev, b), PTR_ERR(bdev));
394 return NULL; 510 return NULL;
395} 511}
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
448 ext4_mb_release(sb); 564 ext4_mb_release(sb);
449 ext4_ext_release(sb); 565 ext4_ext_release(sb);
450 ext4_xattr_put_super(sb); 566 ext4_xattr_put_super(sb);
451 err = jbd2_journal_destroy(sbi->s_journal); 567 if (sbi->s_journal) {
452 sbi->s_journal = NULL; 568 err = jbd2_journal_destroy(sbi->s_journal);
453 if (err < 0) 569 sbi->s_journal = NULL;
454 ext4_abort(sb, __func__, "Couldn't clean up the journal"); 570 if (err < 0)
455 571 ext4_abort(sb, __func__,
572 "Couldn't clean up the journal");
573 }
456 if (!(sb->s_flags & MS_RDONLY)) { 574 if (!(sb->s_flags & MS_RDONLY)) {
457 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 575 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
458 es->s_state = cpu_to_le16(sbi->s_mount_state); 576 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
522 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 640 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
523 INIT_LIST_HEAD(&ei->i_prealloc_list); 641 INIT_LIST_HEAD(&ei->i_prealloc_list);
524 spin_lock_init(&ei->i_prealloc_lock); 642 spin_lock_init(&ei->i_prealloc_lock);
643 /*
644 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
645 * therefore it can be null here. Don't check it, just initialize
646 * jinode.
647 */
525 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); 648 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
526 ei->i_reserved_data_blocks = 0; 649 ei->i_reserved_data_blocks = 0;
527 ei->i_reserved_meta_blocks = 0; 650 ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
588 } 711 }
589#endif 712#endif
590 ext4_discard_preallocations(inode); 713 ext4_discard_preallocations(inode);
591 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 714 if (EXT4_JOURNAL(inode))
715 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
592 &EXT4_I(inode)->jinode); 716 &EXT4_I(inode)->jinode);
593} 717}
594 718
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681#endif 805#endif
682 if (!test_opt(sb, RESERVATION)) 806 if (!test_opt(sb, RESERVATION))
683 seq_puts(seq, ",noreservation"); 807 seq_puts(seq, ",noreservation");
684 if (sbi->s_commit_interval) { 808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
685 seq_printf(seq, ",commit=%u", 809 seq_printf(seq, ",commit=%u",
686 (unsigned) (sbi->s_commit_interval / HZ)); 810 (unsigned) (sbi->s_commit_interval / HZ));
687 } 811 }
812 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
813 seq_printf(seq, ",min_batch_time=%u",
814 (unsigned) sbi->s_min_batch_time);
815 }
816 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
817 seq_printf(seq, ",max_batch_time=%u",
818 (unsigned) sbi->s_min_batch_time);
819 }
820
688 /* 821 /*
689 * We're changing the default of barrier mount option, so 822 * We're changing the default of barrier mount option, so
690 * let's always display its mount state so it's clear what its 823 * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
696 seq_puts(seq, ",journal_async_commit"); 829 seq_puts(seq, ",journal_async_commit");
697 if (test_opt(sb, NOBH)) 830 if (test_opt(sb, NOBH))
698 seq_puts(seq, ",nobh"); 831 seq_puts(seq, ",nobh");
699 if (!test_opt(sb, EXTENTS))
700 seq_puts(seq, ",noextents");
701 if (test_opt(sb, I_VERSION)) 832 if (test_opt(sb, I_VERSION))
702 seq_puts(seq, ",i_version"); 833 seq_puts(seq, ",i_version");
703 if (!test_opt(sb, DELALLOC)) 834 if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
772 ext4_nfs_get_inode); 903 ext4_nfs_get_inode);
773} 904}
774 905
906/*
907 * Try to release metadata pages (indirect blocks, directories) which are
908 * mapped via the block device. Since these pages could have journal heads
909 * which would prevent try_to_free_buffers() from freeing them, we must use
910 * jbd2 layer's try_to_free_buffers() function to release them.
911 */
912static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
913{
914 journal_t *journal = EXT4_SB(sb)->s_journal;
915
916 WARN_ON(PageChecked(page));
917 if (!page_has_buffers(page))
918 return 0;
919 if (journal)
920 return jbd2_journal_try_to_free_buffers(journal, page,
921 wait & ~__GFP_WAIT);
922 return try_to_free_buffers(page);
923}
924
775#ifdef CONFIG_QUOTA 925#ifdef CONFIG_QUOTA
776#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") 926#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
777#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 927#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -828,8 +978,8 @@ static const struct super_operations ext4_sops = {
828 .put_super = ext4_put_super, 978 .put_super = ext4_put_super,
829 .write_super = ext4_write_super, 979 .write_super = ext4_write_super,
830 .sync_fs = ext4_sync_fs, 980 .sync_fs = ext4_sync_fs,
831 .write_super_lockfs = ext4_write_super_lockfs, 981 .freeze_fs = ext4_freeze,
832 .unlockfs = ext4_unlockfs, 982 .unfreeze_fs = ext4_unfreeze,
833 .statfs = ext4_statfs, 983 .statfs = ext4_statfs,
834 .remount_fs = ext4_remount, 984 .remount_fs = ext4_remount,
835 .clear_inode = ext4_clear_inode, 985 .clear_inode = ext4_clear_inode,
@@ -838,6 +988,7 @@ static const struct super_operations ext4_sops = {
838 .quota_read = ext4_quota_read, 988 .quota_read = ext4_quota_read,
839 .quota_write = ext4_quota_write, 989 .quota_write = ext4_quota_write,
840#endif 990#endif
991 .bdev_try_to_free_page = bdev_try_to_free_page,
841}; 992};
842 993
843static const struct export_operations ext4_export_ops = { 994static const struct export_operations ext4_export_ops = {
@@ -852,16 +1003,17 @@ enum {
852 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1003 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
853 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1004 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
854 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1005 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
855 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 1006 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1007 Opt_journal_update, Opt_journal_dev,
856 Opt_journal_checksum, Opt_journal_async_commit, 1008 Opt_journal_checksum, Opt_journal_async_commit,
857 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1009 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
858 Opt_data_err_abort, Opt_data_err_ignore, 1010 Opt_data_err_abort, Opt_data_err_ignore,
859 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1011 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
860 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1012 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
861 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1013 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
862 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 1014 Opt_grpquota, Opt_i_version,
863 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1015 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
864 Opt_inode_readahead_blks 1016 Opt_inode_readahead_blks, Opt_journal_ioprio
865}; 1017};
866 1018
867static const match_table_t tokens = { 1019static const match_table_t tokens = {
@@ -891,8 +1043,9 @@ static const match_table_t tokens = {
891 {Opt_nobh, "nobh"}, 1043 {Opt_nobh, "nobh"},
892 {Opt_bh, "bh"}, 1044 {Opt_bh, "bh"},
893 {Opt_commit, "commit=%u"}, 1045 {Opt_commit, "commit=%u"},
1046 {Opt_min_batch_time, "min_batch_time=%u"},
1047 {Opt_max_batch_time, "max_batch_time=%u"},
894 {Opt_journal_update, "journal=update"}, 1048 {Opt_journal_update, "journal=update"},
895 {Opt_journal_inum, "journal=%u"},
896 {Opt_journal_dev, "journal_dev=%u"}, 1049 {Opt_journal_dev, "journal_dev=%u"},
897 {Opt_journal_checksum, "journal_checksum"}, 1050 {Opt_journal_checksum, "journal_checksum"},
898 {Opt_journal_async_commit, "journal_async_commit"}, 1051 {Opt_journal_async_commit, "journal_async_commit"},
@@ -913,14 +1066,13 @@ static const match_table_t tokens = {
913 {Opt_quota, "quota"}, 1066 {Opt_quota, "quota"},
914 {Opt_usrquota, "usrquota"}, 1067 {Opt_usrquota, "usrquota"},
915 {Opt_barrier, "barrier=%u"}, 1068 {Opt_barrier, "barrier=%u"},
916 {Opt_extents, "extents"},
917 {Opt_noextents, "noextents"},
918 {Opt_i_version, "i_version"}, 1069 {Opt_i_version, "i_version"},
919 {Opt_stripe, "stripe=%u"}, 1070 {Opt_stripe, "stripe=%u"},
920 {Opt_resize, "resize"}, 1071 {Opt_resize, "resize"},
921 {Opt_delalloc, "delalloc"}, 1072 {Opt_delalloc, "delalloc"},
922 {Opt_nodelalloc, "nodelalloc"}, 1073 {Opt_nodelalloc, "nodelalloc"},
923 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1074 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1075 {Opt_journal_ioprio, "journal_ioprio=%u"},
924 {Opt_err, NULL}, 1076 {Opt_err, NULL},
925}; 1077};
926 1078
@@ -945,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
945 return sb_block; 1097 return sb_block;
946} 1098}
947 1099
1100#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1101
948static int parse_options(char *options, struct super_block *sb, 1102static int parse_options(char *options, struct super_block *sb,
949 unsigned int *inum, unsigned long *journal_devnum, 1103 unsigned long *journal_devnum,
1104 unsigned int *journal_ioprio,
950 ext4_fsblk_t *n_blocks_count, int is_remount) 1105 ext4_fsblk_t *n_blocks_count, int is_remount)
951{ 1106{
952 struct ext4_sb_info *sbi = EXT4_SB(sb); 1107 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -958,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
958 int qtype, qfmt; 1113 int qtype, qfmt;
959 char *qname; 1114 char *qname;
960#endif 1115#endif
961 ext4_fsblk_t last_block;
962 1116
963 if (!options) 1117 if (!options)
964 return 1; 1118 return 1;
@@ -1070,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1070 } 1224 }
1071 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1225 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1072 break; 1226 break;
1073 case Opt_journal_inum:
1074 if (is_remount) {
1075 printk(KERN_ERR "EXT4-fs: cannot specify "
1076 "journal on remount\n");
1077 return 0;
1078 }
1079 if (match_int(&args[0], &option))
1080 return 0;
1081 *inum = option;
1082 break;
1083 case Opt_journal_dev: 1227 case Opt_journal_dev:
1084 if (is_remount) { 1228 if (is_remount) {
1085 printk(KERN_ERR "EXT4-fs: cannot specify " 1229 printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1109,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
1109 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1253 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1110 sbi->s_commit_interval = HZ * option; 1254 sbi->s_commit_interval = HZ * option;
1111 break; 1255 break;
1256 case Opt_max_batch_time:
1257 if (match_int(&args[0], &option))
1258 return 0;
1259 if (option < 0)
1260 return 0;
1261 if (option == 0)
1262 option = EXT4_DEF_MAX_BATCH_TIME;
1263 sbi->s_max_batch_time = option;
1264 break;
1265 case Opt_min_batch_time:
1266 if (match_int(&args[0], &option))
1267 return 0;
1268 if (option < 0)
1269 return 0;
1270 sbi->s_min_batch_time = option;
1271 break;
1112 case Opt_data_journal: 1272 case Opt_data_journal:
1113 data_opt = EXT4_MOUNT_JOURNAL_DATA; 1273 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1114 goto datacheck; 1274 goto datacheck;
@@ -1279,33 +1439,6 @@ set_qf_format:
1279 case Opt_bh: 1439 case Opt_bh:
1280 clear_opt(sbi->s_mount_opt, NOBH); 1440 clear_opt(sbi->s_mount_opt, NOBH);
1281 break; 1441 break;
1282 case Opt_extents:
1283 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1284 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1285 ext4_warning(sb, __func__,
1286 "extents feature not enabled "
1287 "on this filesystem, use tune2fs\n");
1288 return 0;
1289 }
1290 set_opt(sbi->s_mount_opt, EXTENTS);
1291 break;
1292 case Opt_noextents:
1293 /*
1294 * When e2fsprogs support resizing an already existing
1295 * ext3 file system to greater than 2**32 we need to
1296 * add support to block allocator to handle growing
1297 * already existing block mapped inode so that blocks
1298 * allocated for them fall within 2**32
1299 */
1300 last_block = ext4_blocks_count(sbi->s_es) - 1;
1301 if (last_block > 0xffffffffULL) {
1302 printk(KERN_ERR "EXT4-fs: Filesystem too "
1303 "large to mount with "
1304 "-o noextents options\n");
1305 return 0;
1306 }
1307 clear_opt(sbi->s_mount_opt, EXTENTS);
1308 break;
1309 case Opt_i_version: 1442 case Opt_i_version:
1310 set_opt(sbi->s_mount_opt, I_VERSION); 1443 set_opt(sbi->s_mount_opt, I_VERSION);
1311 sb->s_flags |= MS_I_VERSION; 1444 sb->s_flags |= MS_I_VERSION;
@@ -1330,6 +1463,14 @@ set_qf_format:
1330 return 0; 1463 return 0;
1331 sbi->s_inode_readahead_blks = option; 1464 sbi->s_inode_readahead_blks = option;
1332 break; 1465 break;
1466 case Opt_journal_ioprio:
1467 if (match_int(&args[0], &option))
1468 return 0;
1469 if (option < 0 || option > 7)
1470 break;
1471 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1472 option);
1473 break;
1333 default: 1474 default:
1334 printk(KERN_ERR 1475 printk(KERN_ERR
1335 "EXT4-fs: Unrecognized mount option \"%s\" " 1476 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1405,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1405 printk(KERN_WARNING 1546 printk(KERN_WARNING
1406 "EXT4-fs warning: checktime reached, " 1547 "EXT4-fs warning: checktime reached, "
1407 "running e2fsck is recommended\n"); 1548 "running e2fsck is recommended\n");
1408#if 0 1549 if (!sbi->s_journal)
1409 /* @@@ We _will_ want to clear the valid bit if we find 1550 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1410 * inconsistencies, to force a fsck at reboot. But for
1411 * a plain journaled filesystem we can keep it set as
1412 * valid forever! :)
1413 */
1414 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1415#endif
1416 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1551 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1417 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 1552 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1418 le16_add_cpu(&es->s_mnt_count, 1); 1553 le16_add_cpu(&es->s_mnt_count, 1);
1419 es->s_mtime = cpu_to_le32(get_seconds()); 1554 es->s_mtime = cpu_to_le32(get_seconds());
1420 ext4_update_dynamic_rev(sb); 1555 ext4_update_dynamic_rev(sb);
1421 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1556 if (sbi->s_journal)
1557 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1422 1558
1423 ext4_commit_super(sb, es, 1); 1559 ext4_commit_super(sb, es, 1);
1424 if (test_opt(sb, DEBUG)) 1560 if (test_opt(sb, DEBUG))
1425 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " 1561 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1426 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1562 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1427 sb->s_blocksize, 1563 sb->s_blocksize,
1428 sbi->s_groups_count, 1564 sbi->s_groups_count,
@@ -1430,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1430 EXT4_INODES_PER_GROUP(sb), 1566 EXT4_INODES_PER_GROUP(sb),
1431 sbi->s_mount_opt); 1567 sbi->s_mount_opt);
1432 1568
1433 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", 1569 if (EXT4_SB(sb)->s_journal) {
1434 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : 1570 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1435 "external", EXT4_SB(sb)->s_journal->j_devname); 1571 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1572 "external", EXT4_SB(sb)->s_journal->j_devname);
1573 } else {
1574 printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
1575 }
1436 return res; 1576 return res;
1437} 1577}
1438 1578
@@ -1444,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1444 ext4_group_t flex_group_count; 1584 ext4_group_t flex_group_count;
1445 ext4_group_t flex_group; 1585 ext4_group_t flex_group;
1446 int groups_per_flex = 0; 1586 int groups_per_flex = 0;
1447 __u64 block_bitmap = 0;
1448 int i; 1587 int i;
1449 1588
1450 if (!sbi->s_es->s_log_groups_per_flex) { 1589 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1463,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
1463 sizeof(struct flex_groups), GFP_KERNEL); 1602 sizeof(struct flex_groups), GFP_KERNEL);
1464 if (sbi->s_flex_groups == NULL) { 1603 if (sbi->s_flex_groups == NULL) {
1465 printk(KERN_ERR "EXT4-fs: not enough memory for " 1604 printk(KERN_ERR "EXT4-fs: not enough memory for "
1466 "%lu flex groups\n", flex_group_count); 1605 "%u flex groups\n", flex_group_count);
1467 goto failed; 1606 goto failed;
1468 } 1607 }
1469 1608
1470 gdp = ext4_get_group_desc(sb, 1, &bh);
1471 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1472
1473 for (i = 0; i < sbi->s_groups_count; i++) { 1609 for (i = 0; i < sbi->s_groups_count; i++) {
1474 gdp = ext4_get_group_desc(sb, i, &bh); 1610 gdp = ext4_get_group_desc(sb, i, &bh);
1475 1611
1476 flex_group = ext4_flex_group(sbi, i); 1612 flex_group = ext4_flex_group(sbi, i);
1477 sbi->s_flex_groups[flex_group].free_inodes += 1613 sbi->s_flex_groups[flex_group].free_inodes +=
1478 le16_to_cpu(gdp->bg_free_inodes_count); 1614 ext4_free_inodes_count(sb, gdp);
1479 sbi->s_flex_groups[flex_group].free_blocks += 1615 sbi->s_flex_groups[flex_group].free_blocks +=
1480 le16_to_cpu(gdp->bg_free_blocks_count); 1616 ext4_free_blks_count(sb, gdp);
1481 } 1617 }
1482 1618
1483 return 1; 1619 return 1;
@@ -1551,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1551 block_bitmap = ext4_block_bitmap(sb, gdp); 1687 block_bitmap = ext4_block_bitmap(sb, gdp);
1552 if (block_bitmap < first_block || block_bitmap > last_block) { 1688 if (block_bitmap < first_block || block_bitmap > last_block) {
1553 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1689 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1554 "Block bitmap for group %lu not in group " 1690 "Block bitmap for group %u not in group "
1555 "(block %llu)!\n", i, block_bitmap); 1691 "(block %llu)!\n", i, block_bitmap);
1556 return 0; 1692 return 0;
1557 } 1693 }
1558 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1694 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1559 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1695 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1560 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1696 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1561 "Inode bitmap for group %lu not in group " 1697 "Inode bitmap for group %u not in group "
1562 "(block %llu)!\n", i, inode_bitmap); 1698 "(block %llu)!\n", i, inode_bitmap);
1563 return 0; 1699 return 0;
1564 } 1700 }
@@ -1566,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1566 if (inode_table < first_block || 1702 if (inode_table < first_block ||
1567 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1703 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1568 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1704 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1569 "Inode table for group %lu not in group " 1705 "Inode table for group %u not in group "
1570 "(block %llu)!\n", i, inode_table); 1706 "(block %llu)!\n", i, inode_table);
1571 return 0; 1707 return 0;
1572 } 1708 }
1573 spin_lock(sb_bgl_lock(sbi, i)); 1709 spin_lock(sb_bgl_lock(sbi, i));
1574 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1710 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1575 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1711 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1576 "Checksum for group %lu failed (%u!=%u)\n", 1712 "Checksum for group %u failed (%u!=%u)\n",
1577 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1713 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1578 gdp)), le16_to_cpu(gdp->bg_checksum)); 1714 gdp)), le16_to_cpu(gdp->bg_checksum));
1579 if (!(sb->s_flags & MS_RDONLY)) { 1715 if (!(sb->s_flags & MS_RDONLY)) {
@@ -1865,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1865 ext4_fsblk_t sb_block = get_sb_block(&data); 2001 ext4_fsblk_t sb_block = get_sb_block(&data);
1866 ext4_fsblk_t logical_sb_block; 2002 ext4_fsblk_t logical_sb_block;
1867 unsigned long offset = 0; 2003 unsigned long offset = 0;
1868 unsigned int journal_inum = 0;
1869 unsigned long journal_devnum = 0; 2004 unsigned long journal_devnum = 0;
1870 unsigned long def_mount_opts; 2005 unsigned long def_mount_opts;
1871 struct inode *root; 2006 struct inode *root;
1872 char *cp; 2007 char *cp;
2008 const char *descr;
1873 int ret = -EINVAL; 2009 int ret = -EINVAL;
1874 int blocksize; 2010 int blocksize;
1875 int db_count; 2011 unsigned int db_count;
1876 int i; 2012 unsigned int i;
1877 int needs_recovery, has_huge_files; 2013 int needs_recovery, has_huge_files;
1878 __le32 features; 2014 int features;
1879 __u64 blocks_count; 2015 __u64 blocks_count;
1880 int err; 2016 int err;
2017 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
1881 2018
1882 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2019 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1883 if (!sbi) 2020 if (!sbi)
@@ -1958,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1958 2095
1959 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2096 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1960 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2097 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2098 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2099 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2100 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
1961 2101
1962 set_opt(sbi->s_mount_opt, RESERVATION); 2102 set_opt(sbi->s_mount_opt, RESERVATION);
1963 set_opt(sbi->s_mount_opt, BARRIER); 2103 set_opt(sbi->s_mount_opt, BARRIER);
1964 2104
1965 /* 2105 /*
1966 * turn on extents feature by default in ext4 filesystem
1967 * only if feature flag already set by mkfs or tune2fs.
1968 * Use -o noextents to turn it off
1969 */
1970 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
1971 set_opt(sbi->s_mount_opt, EXTENTS);
1972 else
1973 ext4_warning(sb, __func__,
1974 "extents feature not enabled on this filesystem, "
1975 "use tune2fs.\n");
1976
1977 /*
1978 * enable delayed allocation by default 2106 * enable delayed allocation by default
1979 * Use -o nodelalloc to turn it off 2107 * Use -o nodelalloc to turn it off
1980 */ 2108 */
1981 set_opt(sbi->s_mount_opt, DELALLOC); 2109 set_opt(sbi->s_mount_opt, DELALLOC);
1982 2110
1983 2111
1984 if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, 2112 if (!parse_options((char *) data, sb, &journal_devnum,
1985 NULL, 0)) 2113 &journal_ioprio, NULL, 0))
1986 goto failed_mount; 2114 goto failed_mount;
1987 2115
1988 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2116 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2004,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2004 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2132 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2005 if (features) { 2133 if (features) {
2006 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " 2134 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
2007 "unsupported optional features (%x).\n", 2135 "unsupported optional features (%x).\n", sb->s_id,
2008 sb->s_id, le32_to_cpu(features)); 2136 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2137 ~EXT4_FEATURE_INCOMPAT_SUPP));
2009 goto failed_mount; 2138 goto failed_mount;
2010 } 2139 }
2011 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); 2140 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2012 if (!(sb->s_flags & MS_RDONLY) && features) { 2141 if (!(sb->s_flags & MS_RDONLY) && features) {
2013 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " 2142 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
2014 "unsupported optional features (%x).\n", 2143 "unsupported optional features (%x).\n", sb->s_id,
2015 sb->s_id, le32_to_cpu(features)); 2144 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2145 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2016 goto failed_mount; 2146 goto failed_mount;
2017 } 2147 }
2018 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2148 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2117,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2117 for (i = 0; i < 4; i++) 2247 for (i = 0; i < 4; i++)
2118 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2248 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2119 sbi->s_def_hash_version = es->s_def_hash_version; 2249 sbi->s_def_hash_version = es->s_def_hash_version;
2250 i = le32_to_cpu(es->s_flags);
2251 if (i & EXT2_FLAGS_UNSIGNED_HASH)
2252 sbi->s_hash_unsigned = 3;
2253 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2254#ifdef __CHAR_UNSIGNED__
2255 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2256 sbi->s_hash_unsigned = 3;
2257#else
2258 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2259#endif
2260 sb->s_dirt = 1;
2261 }
2120 2262
2121 if (sbi->s_blocks_per_group > blocksize * 8) { 2263 if (sbi->s_blocks_per_group > blocksize * 8) {
2122 printk(KERN_ERR 2264 printk(KERN_ERR
@@ -2144,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2144 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2286 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2145 goto cantfind_ext4; 2287 goto cantfind_ext4;
2146 2288
2147 /* ensure blocks_count calculation below doesn't sign-extend */ 2289 /*
2148 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < 2290 * It makes no sense for the first data block to be beyond the end
2149 le32_to_cpu(es->s_first_data_block) + 1) { 2291 * of the filesystem.
2150 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " 2292 */
2151 "first data block %u, blocks per group %lu\n", 2293 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2152 ext4_blocks_count(es), 2294 printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
2153 le32_to_cpu(es->s_first_data_block), 2295 "block %u is beyond end of filesystem (%llu)\n",
2154 EXT4_BLOCKS_PER_GROUP(sb)); 2296 le32_to_cpu(es->s_first_data_block),
2297 ext4_blocks_count(es));
2155 goto failed_mount; 2298 goto failed_mount;
2156 } 2299 }
2157 blocks_count = (ext4_blocks_count(es) - 2300 blocks_count = (ext4_blocks_count(es) -
2158 le32_to_cpu(es->s_first_data_block) + 2301 le32_to_cpu(es->s_first_data_block) +
2159 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2302 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2160 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2303 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2304 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2305 printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2306 "(block count %llu, first data block %u, "
2307 "blocks per group %lu)\n", sbi->s_groups_count,
2308 ext4_blocks_count(es),
2309 le32_to_cpu(es->s_first_data_block),
2310 EXT4_BLOCKS_PER_GROUP(sb));
2311 goto failed_mount;
2312 }
2161 sbi->s_groups_count = blocks_count; 2313 sbi->s_groups_count = blocks_count;
2162 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2314 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2163 EXT4_DESC_PER_BLOCK(sb); 2315 EXT4_DESC_PER_BLOCK(sb);
@@ -2269,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2269 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2421 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2270 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2422 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2271 ext4_commit_super(sb, es, 1); 2423 ext4_commit_super(sb, es, 1);
2272 printk(KERN_CRIT
2273 "EXT4-fs (device %s): mount failed\n",
2274 sb->s_id);
2275 goto failed_mount4; 2424 goto failed_mount4;
2276 } 2425 }
2277 } 2426 }
2278 } else if (journal_inum) { 2427 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2279 if (ext4_create_journal(sb, es, journal_inum)) 2428 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2280 goto failed_mount3; 2429 printk(KERN_ERR "EXT4-fs: required journal recovery "
2430 "suppressed and not mounted read-only\n");
2431 goto failed_mount4;
2281 } else { 2432 } else {
2282 if (!silent) 2433 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2283 printk(KERN_ERR 2434 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2284 "ext4: No journal on filesystem on %s\n", 2435 sbi->s_journal = NULL;
2285 sb->s_id); 2436 needs_recovery = 0;
2286 goto failed_mount3; 2437 goto no_journal;
2287 } 2438 }
2288 2439
2289 if (ext4_blocks_count(es) > 0xffffffffULL && 2440 if (ext4_blocks_count(es) > 0xffffffffULL &&
2290 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2441 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2291 JBD2_FEATURE_INCOMPAT_64BIT)) { 2442 JBD2_FEATURE_INCOMPAT_64BIT)) {
2292 printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); 2443 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
2293 goto failed_mount4; 2444 goto failed_mount4;
2294 } 2445 }
2295 2446
@@ -2334,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2334 default: 2485 default:
2335 break; 2486 break;
2336 } 2487 }
2488 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2489
2490no_journal:
2337 2491
2338 if (test_opt(sb, NOBH)) { 2492 if (test_opt(sb, NOBH)) {
2339 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2493 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2419,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2419 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2573 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2420 ext4_orphan_cleanup(sb, es); 2574 ext4_orphan_cleanup(sb, es);
2421 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2575 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2422 if (needs_recovery) 2576 if (needs_recovery) {
2423 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2577 printk(KERN_INFO "EXT4-fs: recovery complete.\n");
2424 ext4_mark_recovery_complete(sb, es); 2578 ext4_mark_recovery_complete(sb, es);
2425 printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", 2579 }
2426 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2580 if (EXT4_SB(sb)->s_journal) {
2427 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2581 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2428 "writeback"); 2582 descr = " journalled data mode";
2583 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2584 descr = " ordered data mode";
2585 else
2586 descr = " writeback data mode";
2587 } else
2588 descr = "out journal";
2589
2590 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
2591 sb->s_id, descr);
2429 2592
2430 lock_kernel(); 2593 lock_kernel();
2431 return 0; 2594 return 0;
@@ -2437,8 +2600,11 @@ cantfind_ext4:
2437 goto failed_mount; 2600 goto failed_mount;
2438 2601
2439failed_mount4: 2602failed_mount4:
2440 jbd2_journal_destroy(sbi->s_journal); 2603 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
2441 sbi->s_journal = NULL; 2604 if (sbi->s_journal) {
2605 jbd2_journal_destroy(sbi->s_journal);
2606 sbi->s_journal = NULL;
2607 }
2442failed_mount3: 2608failed_mount3:
2443 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2609 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2444 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2610 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2475,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2475{ 2641{
2476 struct ext4_sb_info *sbi = EXT4_SB(sb); 2642 struct ext4_sb_info *sbi = EXT4_SB(sb);
2477 2643
2478 if (sbi->s_commit_interval) 2644 journal->j_commit_interval = sbi->s_commit_interval;
2479 journal->j_commit_interval = sbi->s_commit_interval; 2645 journal->j_min_batch_time = sbi->s_min_batch_time;
2480 /* We could also set up an ext4-specific default for the commit 2646 journal->j_max_batch_time = sbi->s_max_batch_time;
2481 * interval here, but for now we'll just fall back to the jbd
2482 * default. */
2483 2647
2484 spin_lock(&journal->j_state_lock); 2648 spin_lock(&journal->j_state_lock);
2485 if (test_opt(sb, BARRIER)) 2649 if (test_opt(sb, BARRIER))
@@ -2499,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2499 struct inode *journal_inode; 2663 struct inode *journal_inode;
2500 journal_t *journal; 2664 journal_t *journal;
2501 2665
2666 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2667
2502 /* First, test for the existence of a valid inode on disk. Bad 2668 /* First, test for the existence of a valid inode on disk. Bad
2503 * things happen if we iget() an unused inode, as the subsequent 2669 * things happen if we iget() an unused inode, as the subsequent
2504 * iput() will try to delete it. */ 2670 * iput() will try to delete it. */
@@ -2547,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2547 struct ext4_super_block *es; 2713 struct ext4_super_block *es;
2548 struct block_device *bdev; 2714 struct block_device *bdev;
2549 2715
2716 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2717
2550 bdev = ext4_blkdev_get(j_dev); 2718 bdev = ext4_blkdev_get(j_dev);
2551 if (bdev == NULL) 2719 if (bdev == NULL)
2552 return NULL; 2720 return NULL;
2553 2721
2554 if (bd_claim(bdev, sb)) { 2722 if (bd_claim(bdev, sb)) {
2555 printk(KERN_ERR 2723 printk(KERN_ERR
2556 "EXT4: failed to claim external journal device.\n"); 2724 "EXT4-fs: failed to claim external journal device.\n");
2557 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2725 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2558 return NULL; 2726 return NULL;
2559 } 2727 }
@@ -2634,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
2634 int err = 0; 2802 int err = 0;
2635 int really_read_only; 2803 int really_read_only;
2636 2804
2805 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2806
2637 if (journal_devnum && 2807 if (journal_devnum &&
2638 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2808 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2639 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 2809 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2718,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb,
2718 return 0; 2888 return 0;
2719} 2889}
2720 2890
2721static int ext4_create_journal(struct super_block *sb, 2891static int ext4_commit_super(struct super_block *sb,
2722 struct ext4_super_block *es,
2723 unsigned int journal_inum)
2724{
2725 journal_t *journal;
2726 int err;
2727
2728 if (sb->s_flags & MS_RDONLY) {
2729 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2730 "create journal.\n");
2731 return -EROFS;
2732 }
2733
2734 journal = ext4_get_journal(sb, journal_inum);
2735 if (!journal)
2736 return -EINVAL;
2737
2738 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2739 journal_inum);
2740
2741 err = jbd2_journal_create(journal);
2742 if (err) {
2743 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2744 jbd2_journal_destroy(journal);
2745 return -EIO;
2746 }
2747
2748 EXT4_SB(sb)->s_journal = journal;
2749
2750 ext4_update_dynamic_rev(sb);
2751 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2752 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2753
2754 es->s_journal_inum = cpu_to_le32(journal_inum);
2755 sb->s_dirt = 1;
2756
2757 /* Make sure we flush the recovery flag to disk. */
2758 ext4_commit_super(sb, es, 1);
2759
2760 return 0;
2761}
2762
2763static void ext4_commit_super(struct super_block *sb,
2764 struct ext4_super_block *es, int sync) 2892 struct ext4_super_block *es, int sync)
2765{ 2893{
2766 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 2894 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2895 int error = 0;
2767 2896
2768 if (!sbh) 2897 if (!sbh)
2769 return; 2898 return error;
2770 if (buffer_write_io_error(sbh)) { 2899 if (buffer_write_io_error(sbh)) {
2771 /* 2900 /*
2772 * Oh, dear. A previous attempt to write the 2901 * Oh, dear. A previous attempt to write the
@@ -2776,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb,
2776 * be remapped. Nothing we can do but to retry the 2905 * be remapped. Nothing we can do but to retry the
2777 * write and hope for the best. 2906 * write and hope for the best.
2778 */ 2907 */
2779 printk(KERN_ERR "ext4: previous I/O error to " 2908 printk(KERN_ERR "EXT4-fs: previous I/O error to "
2780 "superblock detected for %s.\n", sb->s_id); 2909 "superblock detected for %s.\n", sb->s_id);
2781 clear_buffer_write_io_error(sbh); 2910 clear_buffer_write_io_error(sbh);
2782 set_buffer_uptodate(sbh); 2911 set_buffer_uptodate(sbh);
2783 } 2912 }
2784 es->s_wtime = cpu_to_le32(get_seconds()); 2913 es->s_wtime = cpu_to_le32(get_seconds());
2785 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2914 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2786 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2915 &EXT4_SB(sb)->s_freeblocks_counter));
2916 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeinodes_counter));
2918
2787 BUFFER_TRACE(sbh, "marking dirty"); 2919 BUFFER_TRACE(sbh, "marking dirty");
2788 mark_buffer_dirty(sbh); 2920 mark_buffer_dirty(sbh);
2789 if (sync) { 2921 if (sync) {
2790 sync_dirty_buffer(sbh); 2922 error = sync_dirty_buffer(sbh);
2791 if (buffer_write_io_error(sbh)) { 2923 if (error)
2792 printk(KERN_ERR "ext4: I/O error while writing " 2924 return error;
2925
2926 error = buffer_write_io_error(sbh);
2927 if (error) {
2928 printk(KERN_ERR "EXT4-fs: I/O error while writing "
2793 "superblock for %s.\n", sb->s_id); 2929 "superblock for %s.\n", sb->s_id);
2794 clear_buffer_write_io_error(sbh); 2930 clear_buffer_write_io_error(sbh);
2795 set_buffer_uptodate(sbh); 2931 set_buffer_uptodate(sbh);
2796 } 2932 }
2797 } 2933 }
2934 return error;
2798} 2935}
2799 2936
2800 2937
@@ -2808,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2808{ 2945{
2809 journal_t *journal = EXT4_SB(sb)->s_journal; 2946 journal_t *journal = EXT4_SB(sb)->s_journal;
2810 2947
2948 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2949 BUG_ON(journal != NULL);
2950 return;
2951 }
2811 jbd2_journal_lock_updates(journal); 2952 jbd2_journal_lock_updates(journal);
2812 if (jbd2_journal_flush(journal) < 0) 2953 if (jbd2_journal_flush(journal) < 0)
2813 goto out; 2954 goto out;
@@ -2837,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
2837 int j_errno; 2978 int j_errno;
2838 const char *errstr; 2979 const char *errstr;
2839 2980
2981 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2982
2840 journal = EXT4_SB(sb)->s_journal; 2983 journal = EXT4_SB(sb)->s_journal;
2841 2984
2842 /* 2985 /*
@@ -2869,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
2869int ext4_force_commit(struct super_block *sb) 3012int ext4_force_commit(struct super_block *sb)
2870{ 3013{
2871 journal_t *journal; 3014 journal_t *journal;
2872 int ret; 3015 int ret = 0;
2873 3016
2874 if (sb->s_flags & MS_RDONLY) 3017 if (sb->s_flags & MS_RDONLY)
2875 return 0; 3018 return 0;
2876 3019
2877 journal = EXT4_SB(sb)->s_journal; 3020 journal = EXT4_SB(sb)->s_journal;
2878 sb->s_dirt = 0; 3021 if (journal) {
2879 ret = ext4_journal_force_commit(journal); 3022 sb->s_dirt = 0;
3023 ret = ext4_journal_force_commit(journal);
3024 }
3025
2880 return ret; 3026 return ret;
2881} 3027}
2882 3028
@@ -2888,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb)
2888 */ 3034 */
2889static void ext4_write_super(struct super_block *sb) 3035static void ext4_write_super(struct super_block *sb)
2890{ 3036{
2891 if (mutex_trylock(&sb->s_lock) != 0) 3037 if (EXT4_SB(sb)->s_journal) {
2892 BUG(); 3038 if (mutex_trylock(&sb->s_lock) != 0)
2893 sb->s_dirt = 0; 3039 BUG();
3040 sb->s_dirt = 0;
3041 } else {
3042 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3043 }
2894} 3044}
2895 3045
2896static int ext4_sync_fs(struct super_block *sb, int wait) 3046static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2899,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2899 3049
2900 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3050 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2901 sb->s_dirt = 0; 3051 sb->s_dirt = 0;
2902 if (wait) 3052 if (EXT4_SB(sb)->s_journal) {
2903 ret = ext4_force_commit(sb); 3053 if (wait)
2904 else 3054 ret = ext4_force_commit(sb);
2905 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3055 else
3056 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
3057 } else {
3058 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3059 }
2906 return ret; 3060 return ret;
2907} 3061}
2908 3062
@@ -2910,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2910 * LVM calls this function before a (read-only) snapshot is created. This 3064 * LVM calls this function before a (read-only) snapshot is created. This
2911 * gives us a chance to flush the journal completely and mark the fs clean. 3065 * gives us a chance to flush the journal completely and mark the fs clean.
2912 */ 3066 */
2913static void ext4_write_super_lockfs(struct super_block *sb) 3067static int ext4_freeze(struct super_block *sb)
2914{ 3068{
3069 int error = 0;
3070 journal_t *journal;
2915 sb->s_dirt = 0; 3071 sb->s_dirt = 0;
2916 3072
2917 if (!(sb->s_flags & MS_RDONLY)) { 3073 if (!(sb->s_flags & MS_RDONLY)) {
2918 journal_t *journal = EXT4_SB(sb)->s_journal; 3074 journal = EXT4_SB(sb)->s_journal;
2919 3075
2920 /* Now we set up the journal barrier. */ 3076 if (journal) {
2921 jbd2_journal_lock_updates(journal); 3077 /* Now we set up the journal barrier. */
3078 jbd2_journal_lock_updates(journal);
2922 3079
2923 /* 3080 /*
2924 * We don't want to clear needs_recovery flag when we failed 3081 * We don't want to clear needs_recovery flag when we
2925 * to flush the journal. 3082 * failed to flush the journal.
2926 */ 3083 */
2927 if (jbd2_journal_flush(journal) < 0) 3084 error = jbd2_journal_flush(journal);
2928 return; 3085 if (error < 0)
3086 goto out;
3087 }
2929 3088
2930 /* Journal blocked and flushed, clear needs_recovery flag. */ 3089 /* Journal blocked and flushed, clear needs_recovery flag. */
2931 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3090 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2932 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3091 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3092 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3093 if (error)
3094 goto out;
2933 } 3095 }
3096 return 0;
3097out:
3098 jbd2_journal_unlock_updates(journal);
3099 return error;
2934} 3100}
2935 3101
2936/* 3102/*
2937 * Called by LVM after the snapshot is done. We need to reset the RECOVER 3103 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2938 * flag here, even though the filesystem is not technically dirty yet. 3104 * flag here, even though the filesystem is not technically dirty yet.
2939 */ 3105 */
2940static void ext4_unlockfs(struct super_block *sb) 3106static int ext4_unfreeze(struct super_block *sb)
2941{ 3107{
2942 if (!(sb->s_flags & MS_RDONLY)) { 3108 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
2943 lock_super(sb); 3109 lock_super(sb);
2944 /* Reser the needs_recovery flag before the fs is unlocked. */ 3110 /* Reser the needs_recovery flag before the fs is unlocked. */
2945 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3111 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2947,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb)
2947 unlock_super(sb); 3113 unlock_super(sb);
2948 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3114 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2949 } 3115 }
3116 return 0;
2950} 3117}
2951 3118
2952static int ext4_remount(struct super_block *sb, int *flags, char *data) 3119static int ext4_remount(struct super_block *sb, int *flags, char *data)
@@ -2957,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2957 unsigned long old_sb_flags; 3124 unsigned long old_sb_flags;
2958 struct ext4_mount_options old_opts; 3125 struct ext4_mount_options old_opts;
2959 ext4_group_t g; 3126 ext4_group_t g;
3127 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
2960 int err; 3128 int err;
2961#ifdef CONFIG_QUOTA 3129#ifdef CONFIG_QUOTA
2962 int i; 3130 int i;
@@ -2968,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2968 old_opts.s_resuid = sbi->s_resuid; 3136 old_opts.s_resuid = sbi->s_resuid;
2969 old_opts.s_resgid = sbi->s_resgid; 3137 old_opts.s_resgid = sbi->s_resgid;
2970 old_opts.s_commit_interval = sbi->s_commit_interval; 3138 old_opts.s_commit_interval = sbi->s_commit_interval;
3139 old_opts.s_min_batch_time = sbi->s_min_batch_time;
3140 old_opts.s_max_batch_time = sbi->s_max_batch_time;
2971#ifdef CONFIG_QUOTA 3141#ifdef CONFIG_QUOTA
2972 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3142 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2973 for (i = 0; i < MAXQUOTAS; i++) 3143 for (i = 0; i < MAXQUOTAS; i++)
2974 old_opts.s_qf_names[i] = sbi->s_qf_names[i]; 3144 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2975#endif 3145#endif
3146 if (sbi->s_journal && sbi->s_journal->j_task->io_context)
3147 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
2976 3148
2977 /* 3149 /*
2978 * Allow the "check" option to be passed as a remount option. 3150 * Allow the "check" option to be passed as a remount option.
2979 */ 3151 */
2980 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { 3152 if (!parse_options(data, sb, NULL, &journal_ioprio,
3153 &n_blocks_count, 1)) {
2981 err = -EINVAL; 3154 err = -EINVAL;
2982 goto restore_opts; 3155 goto restore_opts;
2983 } 3156 }
@@ -2990,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2990 3163
2991 es = sbi->s_es; 3164 es = sbi->s_es;
2992 3165
2993 ext4_init_journal_params(sb, sbi->s_journal); 3166 if (sbi->s_journal) {
3167 ext4_init_journal_params(sb, sbi->s_journal);
3168 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3169 }
2994 3170
2995 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3171 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2996 n_blocks_count > ext4_blocks_count(es)) { 3172 n_blocks_count > ext4_blocks_count(es)) {
@@ -3019,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3019 * We have to unlock super so that we can wait for 3195 * We have to unlock super so that we can wait for
3020 * transactions. 3196 * transactions.
3021 */ 3197 */
3022 unlock_super(sb); 3198 if (sbi->s_journal) {
3023 ext4_mark_recovery_complete(sb, es); 3199 unlock_super(sb);
3024 lock_super(sb); 3200 ext4_mark_recovery_complete(sb, es);
3201 lock_super(sb);
3202 }
3025 } else { 3203 } else {
3026 __le32 ret; 3204 int ret;
3027 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3205 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3028 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3206 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3029 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3207 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3030 "remount RDWR because of unsupported " 3208 "remount RDWR because of unsupported "
3031 "optional features (%x).\n", 3209 "optional features (%x).\n", sb->s_id,
3032 sb->s_id, le32_to_cpu(ret)); 3210 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3211 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3033 err = -EROFS; 3212 err = -EROFS;
3034 goto restore_opts; 3213 goto restore_opts;
3035 } 3214 }
@@ -3046,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3046 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3225 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3047 printk(KERN_ERR 3226 printk(KERN_ERR
3048 "EXT4-fs: ext4_remount: " 3227 "EXT4-fs: ext4_remount: "
3049 "Checksum for group %lu failed (%u!=%u)\n", 3228 "Checksum for group %u failed (%u!=%u)\n",
3050 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3229 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3051 le16_to_cpu(gdp->bg_checksum)); 3230 le16_to_cpu(gdp->bg_checksum));
3052 err = -EINVAL; 3231 err = -EINVAL;
@@ -3075,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3075 * been changed by e2fsck since we originally mounted 3254 * been changed by e2fsck since we originally mounted
3076 * the partition.) 3255 * the partition.)
3077 */ 3256 */
3078 ext4_clear_journal_err(sb, es); 3257 if (sbi->s_journal)
3258 ext4_clear_journal_err(sb, es);
3079 sbi->s_mount_state = le16_to_cpu(es->s_state); 3259 sbi->s_mount_state = le16_to_cpu(es->s_state);
3080 if ((err = ext4_group_extend(sb, es, n_blocks_count))) 3260 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
3081 goto restore_opts; 3261 goto restore_opts;
@@ -3083,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3083 sb->s_flags &= ~MS_RDONLY; 3263 sb->s_flags &= ~MS_RDONLY;
3084 } 3264 }
3085 } 3265 }
3266 if (sbi->s_journal == NULL)
3267 ext4_commit_super(sb, es, 1);
3268
3086#ifdef CONFIG_QUOTA 3269#ifdef CONFIG_QUOTA
3087 /* Release old quota file names */ 3270 /* Release old quota file names */
3088 for (i = 0; i < MAXQUOTAS; i++) 3271 for (i = 0; i < MAXQUOTAS; i++)
@@ -3097,6 +3280,8 @@ restore_opts:
3097 sbi->s_resuid = old_opts.s_resuid; 3280 sbi->s_resuid = old_opts.s_resuid;
3098 sbi->s_resgid = old_opts.s_resgid; 3281 sbi->s_resgid = old_opts.s_resgid;
3099 sbi->s_commit_interval = old_opts.s_commit_interval; 3282 sbi->s_commit_interval = old_opts.s_commit_interval;
3283 sbi->s_min_batch_time = old_opts.s_min_batch_time;
3284 sbi->s_max_batch_time = old_opts.s_max_batch_time;
3100#ifdef CONFIG_QUOTA 3285#ifdef CONFIG_QUOTA
3101 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3286 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3102 for (i = 0; i < MAXQUOTAS; i++) { 3287 for (i = 0; i < MAXQUOTAS; i++) {
@@ -3359,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3359 * When we journal data on quota file, we have to flush journal to see 3544 * When we journal data on quota file, we have to flush journal to see
3360 * all updates to the file when we bypass pagecache... 3545 * all updates to the file when we bypass pagecache...
3361 */ 3546 */
3362 if (ext4_should_journal_data(path.dentry->d_inode)) { 3547 if (EXT4_SB(sb)->s_journal &&
3548 ext4_should_journal_data(path.dentry->d_inode)) {
3363 /* 3549 /*
3364 * We don't need to lock updates but journal_flush() could 3550 * We don't need to lock updates but journal_flush() could
3365 * otherwise be livelocked... 3551 * otherwise be livelocked...
@@ -3433,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3433 struct buffer_head *bh; 3619 struct buffer_head *bh;
3434 handle_t *handle = journal_current_handle(); 3620 handle_t *handle = journal_current_handle();
3435 3621
3436 if (!handle) { 3622 if (EXT4_SB(sb)->s_journal && !handle) {
3437 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3623 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3438 " cancelled because transaction is not started.\n", 3624 " cancelled because transaction is not started.\n",
3439 (unsigned long long)off, (unsigned long long)len); 3625 (unsigned long long)off, (unsigned long long)len);
@@ -3458,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3458 flush_dcache_page(bh->b_page); 3644 flush_dcache_page(bh->b_page);
3459 unlock_buffer(bh); 3645 unlock_buffer(bh);
3460 if (journal_quota) 3646 if (journal_quota)
3461 err = ext4_journal_dirty_metadata(handle, bh); 3647 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3462 else { 3648 else {
3463 /* Always do at least ordered writes for quotas */ 3649 /* Always do at least ordered writes for quotas */
3464 err = ext4_jbd2_file_inode(handle, inode); 3650 err = ext4_jbd2_file_inode(handle, inode);
@@ -3512,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3512static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, 3698static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3513 size_t cnt, loff_t *ppos) 3699 size_t cnt, loff_t *ppos)
3514{ 3700{
3515 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; 3701 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3516 char str[32]; 3702 char str[32];
3517 unsigned long value;
3518 3703
3519 if (cnt >= sizeof(str)) 3704 if (cnt >= sizeof(str))
3520 return -EINVAL; 3705 return -EINVAL;
3521 if (copy_from_user(str, buf, cnt)) 3706 if (copy_from_user(str, buf, cnt))
3522 return -EFAULT; 3707 return -EFAULT;
3523 value = simple_strtol(str, NULL, 0); 3708
3524 if (value < 0) 3709 *p = simple_strtoul(str, NULL, 0);
3525 return -ERANGE;
3526 *p = value;
3527 return cnt; 3710 return cnt;
3528} 3711}
3529 3712
@@ -3614,7 +3797,7 @@ static void __exit exit_ext4_fs(void)
3614} 3797}
3615 3798
3616MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3799MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3617MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); 3800MODULE_DESCRIPTION("Fourth Extended Filesystem");
3618MODULE_LICENSE("GPL"); 3801MODULE_LICENSE("GPL");
3619module_init(init_ext4_fs) 3802module_init(init_ext4_fs)
3620module_exit(exit_ext4_fs) 3803module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
459 sb->s_dirt = 1; 459 sb->s_dirt = 1;
460 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 460 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
461 } 461 }
462} 462}
463 463
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
488 } else { 488 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 489 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_journal_dirty_metadata(handle, bh); 490 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 491 if (IS_SYNC(inode))
492 handle->h_sync = 1; 492 ext4_handle_sync(handle);
493 DQUOT_FREE_BLOCK(inode, 1); 493 DQUOT_FREE_BLOCK(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 494 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 495 le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
724 if (error == -EIO) 724 if (error == -EIO)
725 goto bad_block; 725 goto bad_block;
726 if (!error) 726 if (!error)
727 error = ext4_journal_dirty_metadata(handle, 727 error = ext4_handle_dirty_metadata(handle,
728 bs->bh); 728 inode,
729 bs->bh);
729 if (error) 730 if (error)
730 goto cleanup; 731 goto cleanup;
731 goto inserted; 732 goto inserted;
@@ -794,8 +795,9 @@ inserted:
794 ea_bdebug(new_bh, "reusing; refcount now=%d", 795 ea_bdebug(new_bh, "reusing; refcount now=%d",
795 le32_to_cpu(BHDR(new_bh)->h_refcount)); 796 le32_to_cpu(BHDR(new_bh)->h_refcount));
796 unlock_buffer(new_bh); 797 unlock_buffer(new_bh);
797 error = ext4_journal_dirty_metadata(handle, 798 error = ext4_handle_dirty_metadata(handle,
798 new_bh); 799 inode,
800 new_bh);
799 if (error) 801 if (error)
800 goto cleanup_dquot; 802 goto cleanup_dquot;
801 } 803 }
@@ -810,8 +812,8 @@ inserted:
810 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 814 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode, 815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
814 goal, &error); 816 goal, NULL, &error);
815 if (error) 817 if (error)
816 goto cleanup; 818 goto cleanup;
817 ea_idebug(inode, "creating block %d", block); 819 ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
833 set_buffer_uptodate(new_bh); 835 set_buffer_uptodate(new_bh);
834 unlock_buffer(new_bh); 836 unlock_buffer(new_bh);
835 ext4_xattr_cache_insert(new_bh); 837 ext4_xattr_cache_insert(new_bh);
836 error = ext4_journal_dirty_metadata(handle, new_bh); 838 error = ext4_handle_dirty_metadata(handle,
839 inode, new_bh);
837 if (error) 840 if (error)
838 goto cleanup; 841 goto cleanup;
839 } 842 }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1040 */ 1043 */
1041 is.iloc.bh = NULL; 1044 is.iloc.bh = NULL;
1042 if (IS_SYNC(inode)) 1045 if (IS_SYNC(inode))
1043 handle->h_sync = 1; 1046 ext4_handle_sync(handle);
1044 } 1047 }
1045 1048
1046cleanup: 1049cleanup:
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 777783deddcb..320323d03479 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -211,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
211} 211}
212 212
213/** 213/**
214 * gfs2_write_super_lockfs - prevent further writes to the filesystem 214 * gfs2_freeze - prevent further writes to the filesystem
215 * @sb: the VFS structure for the filesystem 215 * @sb: the VFS structure for the filesystem
216 * 216 *
217 */ 217 */
218 218
219static void gfs2_write_super_lockfs(struct super_block *sb) 219static int gfs2_freeze(struct super_block *sb)
220{ 220{
221 struct gfs2_sbd *sdp = sb->s_fs_info; 221 struct gfs2_sbd *sdp = sb->s_fs_info;
222 int error; 222 int error;
223 223
224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
225 return; 225 return -EINVAL;
226 226
227 for (;;) { 227 for (;;) {
228 error = gfs2_freeze_fs(sdp); 228 error = gfs2_freeze_fs(sdp);
@@ -242,17 +242,19 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
242 fs_err(sdp, "retrying...\n"); 242 fs_err(sdp, "retrying...\n");
243 msleep(1000); 243 msleep(1000);
244 } 244 }
245 return 0;
245} 246}
246 247
247/** 248/**
248 * gfs2_unlockfs - reallow writes to the filesystem 249 * gfs2_unfreeze - reallow writes to the filesystem
249 * @sb: the VFS structure for the filesystem 250 * @sb: the VFS structure for the filesystem
250 * 251 *
251 */ 252 */
252 253
253static void gfs2_unlockfs(struct super_block *sb) 254static int gfs2_unfreeze(struct super_block *sb)
254{ 255{
255 gfs2_unfreeze_fs(sb->s_fs_info); 256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
256} 258}
257 259
258/** 260/**
@@ -688,8 +690,8 @@ const struct super_operations gfs2_super_ops = {
688 .put_super = gfs2_put_super, 690 .put_super = gfs2_put_super,
689 .write_super = gfs2_write_super, 691 .write_super = gfs2_write_super,
690 .sync_fs = gfs2_sync_fs, 692 .sync_fs = gfs2_sync_fs,
691 .write_super_lockfs = gfs2_write_super_lockfs, 693 .freeze_fs = gfs2_freeze,
692 .unlockfs = gfs2_unlockfs, 694 .unfreeze_fs = gfs2_unfreeze,
693 .statfs = gfs2_statfs, 695 .statfs = gfs2_statfs,
694 .remount_fs = gfs2_remount_fs, 696 .remount_fs = gfs2_remount_fs,
695 .clear_inode = gfs2_clear_inode, 697 .clear_inode = gfs2_clear_inode,
diff --git a/fs/inode.c b/fs/inode.c
index 0013ac1af8e7..913ab2d9a5d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1139,11 +1139,16 @@ EXPORT_SYMBOL(remove_inode_hash);
1139 * I_FREEING is set so that no-one will take a new reference to the inode while 1139 * I_FREEING is set so that no-one will take a new reference to the inode while
1140 * it is being deleted. 1140 * it is being deleted.
1141 */ 1141 */
1142static void generic_delete_inode_async(void *data, async_cookie_t cookie) 1142void generic_delete_inode(struct inode *inode)
1143{ 1143{
1144 struct inode *inode = data;
1145 const struct super_operations *op = inode->i_sb->s_op; 1144 const struct super_operations *op = inode->i_sb->s_op;
1146 1145
1146 list_del_init(&inode->i_list);
1147 list_del_init(&inode->i_sb_list);
1148 inode->i_state |= I_FREEING;
1149 inodes_stat.nr_inodes--;
1150 spin_unlock(&inode_lock);
1151
1147 security_inode_delete(inode); 1152 security_inode_delete(inode);
1148 1153
1149 if (op->delete_inode) { 1154 if (op->delete_inode) {
@@ -1167,16 +1172,6 @@ static void generic_delete_inode_async(void *data, async_cookie_t cookie)
1167 destroy_inode(inode); 1172 destroy_inode(inode);
1168} 1173}
1169 1174
1170void generic_delete_inode(struct inode *inode)
1171{
1172 list_del_init(&inode->i_list);
1173 list_del_init(&inode->i_sb_list);
1174 inode->i_state |= I_FREEING;
1175 inodes_stat.nr_inodes--;
1176 spin_unlock(&inode_lock);
1177 async_schedule_special(generic_delete_inode_async, inode, &inode->i_sb->s_async_list);
1178}
1179
1180EXPORT_SYMBOL(generic_delete_inode); 1175EXPORT_SYMBOL(generic_delete_inode);
1181 1176
1182static void generic_forget_inode(struct inode *inode) 1177static void generic_forget_inode(struct inode *inode)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index cc3f1aa1cf7b..20b0a8a24c6b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -439,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
439 return error; 439 return error;
440} 440}
441 441
442static int ioctl_fsfreeze(struct file *filp)
443{
444 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
445
446 if (!capable(CAP_SYS_ADMIN))
447 return -EPERM;
448
449 /* If filesystem doesn't support freeze feature, return. */
450 if (sb->s_op->freeze_fs == NULL)
451 return -EOPNOTSUPP;
452
453 /* If a blockdevice-backed filesystem isn't specified, return. */
454 if (sb->s_bdev == NULL)
455 return -EINVAL;
456
457 /* Freeze */
458 sb = freeze_bdev(sb->s_bdev);
459 if (IS_ERR(sb))
460 return PTR_ERR(sb);
461 return 0;
462}
463
464static int ioctl_fsthaw(struct file *filp)
465{
466 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
467
468 if (!capable(CAP_SYS_ADMIN))
469 return -EPERM;
470
471 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
472 if (sb->s_bdev == NULL)
473 return -EINVAL;
474
475 /* Thaw */
476 return thaw_bdev(sb->s_bdev, sb);
477}
478
442/* 479/*
443 * When you add any new common ioctls to the switches above and below 480 * When you add any new common ioctls to the switches above and below
444 * please update compat_sys_ioctl() too. 481 * please update compat_sys_ioctl() too.
@@ -486,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
486 } else 523 } else
487 error = -ENOTTY; 524 error = -ENOTTY;
488 break; 525 break;
526
527 case FIFREEZE:
528 error = ioctl_fsfreeze(filp);
529 break;
530
531 case FITHAW:
532 error = ioctl_fsthaw(filp);
533 break;
534
489 default: 535 default:
490 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 536 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
491 error = file_ioctl(filp, cmd, arg); 537 error = file_ioctl(filp, cmd, arg);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..1a39ac370942 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29 29
30static int set_task_ioprio(struct task_struct *task, int ioprio) 30int set_task_ioprio(struct task_struct *task, int ioprio)
31{ 31{
32 int err; 32 int err;
33 struct io_context *ioc; 33 struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
70 task_unlock(task); 70 task_unlock(task);
71 return err; 71 return err;
72} 72}
73EXPORT_SYMBOL_GPL(set_task_ioprio);
73 74
74asmlinkage long sys_ioprio_set(int which, int who, int ioprio) 75asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
75{ 76{
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306 int flags; 306 int flags;
307 int err; 307 int err;
308 unsigned long blocknr; 308 unsigned long blocknr;
309 ktime_t start_time;
310 u64 commit_time;
309 char *tagp = NULL; 311 char *tagp = NULL;
310 journal_header_t *header; 312 journal_header_t *header;
311 journal_block_tag_t *tag = NULL; 313 journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418 commit_transaction->t_state = T_FLUSH; 420 commit_transaction->t_state = T_FLUSH;
419 journal->j_committing_transaction = commit_transaction; 421 journal->j_committing_transaction = commit_transaction;
420 journal->j_running_transaction = NULL; 422 journal->j_running_transaction = NULL;
423 start_time = ktime_get();
421 commit_transaction->t_log_start = journal->j_head; 424 commit_transaction->t_log_start = journal->j_head;
422 wake_up(&journal->j_wait_transaction_locked); 425 wake_up(&journal->j_wait_transaction_locked);
423 spin_unlock(&journal->j_state_lock); 426 spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913 J_ASSERT(commit_transaction == journal->j_committing_transaction); 916 J_ASSERT(commit_transaction == journal->j_committing_transaction);
914 journal->j_commit_sequence = commit_transaction->t_tid; 917 journal->j_commit_sequence = commit_transaction->t_tid;
915 journal->j_committing_transaction = NULL; 918 journal->j_committing_transaction = NULL;
919 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
920
921 /*
922 * weight the commit time higher than the average time so we don't
923 * react too strongly to vast changes in commit time
924 */
925 if (likely(journal->j_average_commit_time))
926 journal->j_average_commit_time = (commit_time*3 +
927 journal->j_average_commit_time) / 4;
928 else
929 journal->j_average_commit_time = commit_time;
930
916 spin_unlock(&journal->j_state_lock); 931 spin_unlock(&journal->j_state_lock);
917 932
918 if (commit_transaction->t_checkpoint_list == NULL && 933 if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __journal_temp_unlink_buffer(struct journal_head *jh); 30static void __journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
49{ 50{
50 transaction->t_journal = journal; 51 transaction->t_journal = journal;
51 transaction->t_state = T_RUNNING; 52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
52 transaction->t_tid = journal->j_transaction_sequence++; 54 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 55 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
752 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. 754 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
753 * @handle: transaction to add buffer modifications to 755 * @handle: transaction to add buffer modifications to
754 * @bh: bh to be used for metadata writes 756 * @bh: bh to be used for metadata writes
755 * @credits: variable that will receive credits for the buffer
756 * 757 *
757 * Returns an error code or 0 on success. 758 * Returns an error code or 0 on success.
758 * 759 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
1370{ 1371{
1371 transaction_t *transaction = handle->h_transaction; 1372 transaction_t *transaction = handle->h_transaction;
1372 journal_t *journal = transaction->t_journal; 1373 journal_t *journal = transaction->t_journal;
1373 int old_handle_count, err; 1374 int err;
1374 pid_t pid; 1375 pid_t pid;
1375 1376
1376 J_ASSERT(journal_current_handle() == handle); 1377 J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
1399 * on IO anyway. Speeds up many-threaded, many-dir operations 1400 * on IO anyway. Speeds up many-threaded, many-dir operations
1400 * by 30x or more... 1401 * by 30x or more...
1401 * 1402 *
1403 * We try and optimize the sleep time against what the underlying disk
1404 * can do, instead of having a static sleep time. This is usefull for
1405 * the case where our storage is so fast that it is more optimal to go
1406 * ahead and force a flush and wait for the transaction to be committed
1407 * than it is to wait for an arbitrary amount of time for new writers to
1408 * join the transaction. We acheive this by measuring how long it takes
1409 * to commit a transaction, and compare it with how long this
1410 * transaction has been running, and if run time < commit time then we
1411 * sleep for the delta and commit. This greatly helps super fast disks
1412 * that would see slowdowns as more threads started doing fsyncs.
1413 *
1402 * But don't do this if this process was the most recent one to 1414 * But don't do this if this process was the most recent one to
1403 * perform a synchronous write. We do this to detect the case where a 1415 * perform a synchronous write. We do this to detect the case where a
1404 * single process is doing a stream of sync writes. No point in waiting 1416 * single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
1406 */ 1418 */
1407 pid = current->pid; 1419 pid = current->pid;
1408 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1420 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1421 u64 commit_time, trans_time;
1422
1409 journal->j_last_sync_writer = pid; 1423 journal->j_last_sync_writer = pid;
1410 do { 1424
1411 old_handle_count = transaction->t_handle_count; 1425 spin_lock(&journal->j_state_lock);
1412 schedule_timeout_uninterruptible(1); 1426 commit_time = journal->j_average_commit_time;
1413 } while (old_handle_count != transaction->t_handle_count); 1427 spin_unlock(&journal->j_state_lock);
1428
1429 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1430 transaction->t_start_time));
1431
1432 commit_time = min_t(u64, commit_time,
1433 1000*jiffies_to_usecs(1));
1434
1435 if (trans_time < commit_time) {
1436 ktime_t expires = ktime_add_ns(ktime_get(),
1437 commit_time);
1438 set_current_state(TASK_UNINTERRUPTIBLE);
1439 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1440 }
1414 } 1441 }
1415 1442
1416 current->journal_info = NULL; 1443 current->journal_info = NULL;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
249 return ret; 249 return ret;
250} 250}
251 251
252#define NR_BATCH 64
253
254static void 252static void
255__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 253__flush_batch(journal_t *journal, int *batch_count)
256{ 254{
257 int i; 255 int i;
258 256
259 ll_rw_block(SWRITE, *batch_count, bhs); 257 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
260 for (i = 0; i < *batch_count; i++) { 258 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 259 struct buffer_head *bh = journal->j_chkpt_bhs[i];
262 clear_buffer_jwrite(bh); 260 clear_buffer_jwrite(bh);
263 BUFFER_TRACE(bh, "brelse"); 261 BUFFER_TRACE(bh, "brelse");
264 __brelse(bh); 262 __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
277 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 275 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
278 */ 276 */
279static int __process_buffer(journal_t *journal, struct journal_head *jh, 277static int __process_buffer(journal_t *journal, struct journal_head *jh,
280 struct buffer_head **bhs, int *batch_count, 278 int *batch_count, transaction_t *transaction)
281 transaction_t *transaction)
282{ 279{
283 struct buffer_head *bh = jh2bh(jh); 280 struct buffer_head *bh = jh2bh(jh);
284 int ret = 0; 281 int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
325 get_bh(bh); 322 get_bh(bh);
326 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 323 J_ASSERT_BH(bh, !buffer_jwrite(bh));
327 set_buffer_jwrite(bh); 324 set_buffer_jwrite(bh);
328 bhs[*batch_count] = bh; 325 journal->j_chkpt_bhs[*batch_count] = bh;
329 __buffer_relink_io(jh); 326 __buffer_relink_io(jh);
330 jbd_unlock_bh_state(bh); 327 jbd_unlock_bh_state(bh);
331 transaction->t_chp_stats.cs_written++; 328 transaction->t_chp_stats.cs_written++;
332 (*batch_count)++; 329 (*batch_count)++;
333 if (*batch_count == NR_BATCH) { 330 if (*batch_count == JBD2_NR_BATCH) {
334 spin_unlock(&journal->j_list_lock); 331 spin_unlock(&journal->j_list_lock);
335 __flush_batch(journal, bhs, batch_count); 332 __flush_batch(journal, batch_count);
336 ret = 1; 333 ret = 1;
337 } 334 }
338 } 335 }
@@ -388,7 +385,6 @@ restart:
388 if (journal->j_checkpoint_transactions == transaction && 385 if (journal->j_checkpoint_transactions == transaction &&
389 transaction->t_tid == this_tid) { 386 transaction->t_tid == this_tid) {
390 int batch_count = 0; 387 int batch_count = 0;
391 struct buffer_head *bhs[NR_BATCH];
392 struct journal_head *jh; 388 struct journal_head *jh;
393 int retry = 0, err; 389 int retry = 0, err;
394 390
@@ -402,7 +398,7 @@ restart:
402 retry = 1; 398 retry = 1;
403 break; 399 break;
404 } 400 }
405 retry = __process_buffer(journal, jh, bhs, &batch_count, 401 retry = __process_buffer(journal, jh, &batch_count,
406 transaction); 402 transaction);
407 if (retry < 0 && !result) 403 if (retry < 0 && !result)
408 result = retry; 404 result = retry;
@@ -419,7 +415,7 @@ restart:
419 spin_unlock(&journal->j_list_lock); 415 spin_unlock(&journal->j_list_lock);
420 retry = 1; 416 retry = 1;
421 } 417 }
422 __flush_batch(journal, bhs, &batch_count); 418 __flush_batch(journal, &batch_count);
423 } 419 }
424 420
425 if (retry) { 421 if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
686 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
687 683
688 __jbd2_journal_drop_transaction(journal, transaction); 684 __jbd2_journal_drop_transaction(journal, transaction);
685 kfree(transaction);
689 686
690 /* Just in case anybody was waiting for more transactions to be 687 /* Just in case anybody was waiting for more transactions to be
691 checkpointed... */ 688 checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
760 J_ASSERT(journal->j_running_transaction != transaction); 757 J_ASSERT(journal->j_running_transaction != transaction);
761 758
762 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 759 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
763 kfree(transaction);
764} 760}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c8a1bace685a..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/crc32.h> 25#include <linux/crc32.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bio.h>
28 29
29/* 30/*
30 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
137 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
138 barrier_done = 1; 139 barrier_done = 1;
139 } 140 }
140 ret = submit_bh(WRITE, bh); 141 ret = submit_bh(WRITE_SYNC, bh);
141 if (barrier_done) 142 if (barrier_done)
142 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
143 144
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
158 lock_buffer(bh); 159 lock_buffer(bh);
159 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
160 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
161 ret = submit_bh(WRITE, bh); 162 ret = submit_bh(WRITE_SYNC, bh);
162 } 163 }
163 *cbh = bh; 164 *cbh = bh;
164 return ret; 165 return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
168 * This function along with journal_submit_commit_record 169 * This function along with journal_submit_commit_record
169 * allows to write the commit record asynchronously. 170 * allows to write the commit record asynchronously.
170 */ 171 */
171static int journal_wait_on_commit_record(struct buffer_head *bh) 172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
172{ 174{
173 int ret = 0; 175 int ret = 0;
174 176
177retry:
175 clear_buffer_dirty(bh); 178 clear_buffer_dirty(bh);
176 wait_on_buffer(bh); 179 wait_on_buffer(bh);
180 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 printk(KERN_WARNING
182 "JBD2: wait_on_commit_record: sync failed on %s - "
183 "disabling barriers\n", journal->j_devname);
184 spin_lock(&journal->j_state_lock);
185 journal->j_flags &= ~JBD2_BARRIER;
186 spin_unlock(&journal->j_state_lock);
187
188 lock_buffer(bh);
189 clear_buffer_dirty(bh);
190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync;
192
193 ret = submit_bh(WRITE_SYNC, bh);
194 if (ret) {
195 unlock_buffer(bh);
196 return ret;
197 }
198 goto retry;
199 }
177 200
178 if (unlikely(!buffer_uptodate(bh))) 201 if (unlikely(!buffer_uptodate(bh)))
179 ret = -EIO; 202 ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
332 int flags; 355 int flags;
333 int err; 356 int err;
334 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
335 char *tagp = NULL; 360 char *tagp = NULL;
336 journal_header_t *header; 361 journal_header_t *header;
337 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
338 int space_left = 0; 363 int space_left = 0;
339 int first_tag = 0; 364 int first_tag = 0;
340 int tag_flag; 365 int tag_flag;
341 int i; 366 int i, to_free = 0;
342 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
343 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
344 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
458 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
459 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
460 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
461 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
462 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
463 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -803,7 +829,7 @@ wait_for_iobuf:
803 __jbd2_journal_abort_hard(journal); 829 __jbd2_journal_abort_hard(journal);
804 } 830 }
805 if (!err && !is_journal_aborted(journal)) 831 if (!err && !is_journal_aborted(journal))
806 err = journal_wait_on_commit_record(cbh); 832 err = journal_wait_on_commit_record(journal, cbh);
807 833
808 if (err) 834 if (err)
809 jbd2_journal_abort(journal, err); 835 jbd2_journal_abort(journal, err);
@@ -981,14 +1007,23 @@ restart_loop:
981 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1007 J_ASSERT(commit_transaction == journal->j_committing_transaction);
982 journal->j_commit_sequence = commit_transaction->t_tid; 1008 journal->j_commit_sequence = commit_transaction->t_tid;
983 journal->j_committing_transaction = NULL; 1009 journal->j_committing_transaction = NULL;
984 spin_unlock(&journal->j_state_lock); 1010 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
985 1011
986 if (journal->j_commit_callback) 1012 /*
987 journal->j_commit_callback(journal, commit_transaction); 1013 * weight the commit time higher than the average time so we don't
1014 * react too strongly to vast changes in the commit time
1015 */
1016 if (likely(journal->j_average_commit_time))
1017 journal->j_average_commit_time = (commit_time +
1018 journal->j_average_commit_time*3) / 4;
1019 else
1020 journal->j_average_commit_time = commit_time;
1021 spin_unlock(&journal->j_state_lock);
988 1022
989 if (commit_transaction->t_checkpoint_list == NULL && 1023 if (commit_transaction->t_checkpoint_list == NULL &&
990 commit_transaction->t_checkpoint_io_list == NULL) { 1024 commit_transaction->t_checkpoint_io_list == NULL) {
991 __jbd2_journal_drop_transaction(journal, commit_transaction); 1025 __jbd2_journal_drop_transaction(journal, commit_transaction);
1026 to_free = 1;
992 } else { 1027 } else {
993 if (journal->j_checkpoint_transactions == NULL) { 1028 if (journal->j_checkpoint_transactions == NULL) {
994 journal->j_checkpoint_transactions = commit_transaction; 1029 journal->j_checkpoint_transactions = commit_transaction;
@@ -1007,11 +1042,16 @@ restart_loop:
1007 } 1042 }
1008 spin_unlock(&journal->j_list_lock); 1043 spin_unlock(&journal->j_list_lock);
1009 1044
1045 if (journal->j_commit_callback)
1046 journal->j_commit_callback(journal, commit_transaction);
1047
1010 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1048 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1011 journal->j_devname, journal->j_commit_sequence, 1049 journal->j_devname, commit_transaction->t_tid,
1012 journal->j_tail_sequence); 1050 journal->j_tail_sequence);
1013 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1051 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1014 journal->j_commit_sequence, journal->j_tail_sequence); 1052 journal->j_commit_sequence, journal->j_tail_sequence);
1053 if (to_free)
1054 kfree(commit_transaction);
1015 1055
1016 wake_up(&journal->j_wait_done_commit); 1056 wake_up(&journal->j_wait_done_commit);
1017} 1057}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f6bff9d6f8df..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
40 40
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include <asm/page.h> 42#include <asm/page.h>
43#include <asm/div64.h>
43 44
44EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
45EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -66,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
66EXPORT_SYMBOL(jbd2_journal_check_used_features); 67EXPORT_SYMBOL(jbd2_journal_check_used_features);
67EXPORT_SYMBOL(jbd2_journal_check_available_features); 68EXPORT_SYMBOL(jbd2_journal_check_available_features);
68EXPORT_SYMBOL(jbd2_journal_set_features); 69EXPORT_SYMBOL(jbd2_journal_set_features);
69EXPORT_SYMBOL(jbd2_journal_create);
70EXPORT_SYMBOL(jbd2_journal_load); 70EXPORT_SYMBOL(jbd2_journal_load);
71EXPORT_SYMBOL(jbd2_journal_destroy); 71EXPORT_SYMBOL(jbd2_journal_destroy);
72EXPORT_SYMBOL(jbd2_journal_abort); 72EXPORT_SYMBOL(jbd2_journal_abort);
@@ -132,8 +132,9 @@ static int kjournald2(void *arg)
132 journal->j_task = current; 132 journal->j_task = current;
133 wake_up(&journal->j_wait_done_commit); 133 wake_up(&journal->j_wait_done_commit);
134 134
135 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
136 journal->j_commit_interval / HZ); 136 "commit interval %ld seconds\n", current->pid,
137 journal->j_devname, journal->j_commit_interval / HZ);
137 138
138 /* 139 /*
139 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
@@ -650,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
650 return NULL; 651 return NULL;
651 652
652 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 653 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
654 if (!bh)
655 return NULL;
653 lock_buffer(bh); 656 lock_buffer(bh);
654 memset(bh->b_data, 0, journal->j_blocksize); 657 memset(bh->b_data, 0, journal->j_blocksize);
655 set_buffer_uptodate(bh); 658 set_buffer_uptodate(bh);
@@ -843,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
843 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
844 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
845 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000));
846 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
847 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
848 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
@@ -980,6 +985,8 @@ static journal_t * journal_init_common (void)
980 spin_lock_init(&journal->j_state_lock); 985 spin_lock_init(&journal->j_state_lock);
981 986
982 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 987 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
988 journal->j_min_batch_time = 0;
989 journal->j_max_batch_time = 15000; /* 15ms */
983 990
984 /* The journal is marked for error until we succeed with recovery! */ 991 /* The journal is marked for error until we succeed with recovery! */
985 journal->j_flags = JBD2_ABORT; 992 journal->j_flags = JBD2_ABORT;
@@ -1035,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1035 1042
1036 /* journal descriptor can store up to n blocks -bzzz */ 1043 /* journal descriptor can store up to n blocks -bzzz */
1037 journal->j_blocksize = blocksize; 1044 journal->j_blocksize = blocksize;
1045 jbd2_stats_proc_init(journal);
1038 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1046 n = journal->j_blocksize / sizeof(journal_block_tag_t);
1039 journal->j_wbufsize = n; 1047 journal->j_wbufsize = n;
1040 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1048 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
1041 if (!journal->j_wbuf) { 1049 if (!journal->j_wbuf) {
1042 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1050 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1043 __func__); 1051 __func__);
1044 kfree(journal); 1052 goto out_err;
1045 journal = NULL;
1046 goto out;
1047 } 1053 }
1048 journal->j_dev = bdev; 1054 journal->j_dev = bdev;
1049 journal->j_fs_dev = fs_dev; 1055 journal->j_fs_dev = fs_dev;
@@ -1053,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1053 p = journal->j_devname; 1059 p = journal->j_devname;
1054 while ((p = strchr(p, '/'))) 1060 while ((p = strchr(p, '/')))
1055 *p = '!'; 1061 *p = '!';
1056 jbd2_stats_proc_init(journal);
1057 1062
1058 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1063 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
1059 J_ASSERT(bh != NULL); 1064 if (!bh) {
1065 printk(KERN_ERR
1066 "%s: Cannot get buffer for journal superblock\n",
1067 __func__);
1068 goto out_err;
1069 }
1060 journal->j_sb_buffer = bh; 1070 journal->j_sb_buffer = bh;
1061 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1071 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1062out: 1072
1063 return journal; 1073 return journal;
1074out_err:
1075 jbd2_stats_proc_exit(journal);
1076 kfree(journal);
1077 return NULL;
1064} 1078}
1065 1079
1066/** 1080/**
@@ -1108,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1108 if (!journal->j_wbuf) { 1122 if (!journal->j_wbuf) {
1109 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1123 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1110 __func__); 1124 __func__);
1111 jbd2_stats_proc_exit(journal); 1125 goto out_err;
1112 kfree(journal);
1113 return NULL;
1114 } 1126 }
1115 1127
1116 err = jbd2_journal_bmap(journal, 0, &blocknr); 1128 err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1118,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1118 if (err) { 1130 if (err) {
1119 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1131 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
1120 __func__); 1132 __func__);
1121 jbd2_stats_proc_exit(journal); 1133 goto out_err;
1122 kfree(journal);
1123 return NULL;
1124 } 1134 }
1125 1135
1126 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1136 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1127 J_ASSERT(bh != NULL); 1137 if (!bh) {
1138 printk(KERN_ERR
1139 "%s: Cannot get buffer for journal superblock\n",
1140 __func__);
1141 goto out_err;
1142 }
1128 journal->j_sb_buffer = bh; 1143 journal->j_sb_buffer = bh;
1129 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1144 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1130 1145
1131 return journal; 1146 return journal;
1147out_err:
1148 jbd2_stats_proc_exit(journal);
1149 kfree(journal);
1150 return NULL;
1132} 1151}
1133 1152
1134/* 1153/*
@@ -1177,77 +1196,6 @@ static int journal_reset(journal_t *journal)
1177} 1196}
1178 1197
1179/** 1198/**
1180 * int jbd2_journal_create() - Initialise the new journal file
1181 * @journal: Journal to create. This structure must have been initialised
1182 *
1183 * Given a journal_t structure which tells us which disk blocks we can
1184 * use, create a new journal superblock and initialise all of the
1185 * journal fields from scratch.
1186 **/
1187int jbd2_journal_create(journal_t *journal)
1188{
1189 unsigned long long blocknr;
1190 struct buffer_head *bh;
1191 journal_superblock_t *sb;
1192 int i, err;
1193
1194 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
1195 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
1196 journal->j_maxlen);
1197 journal_fail_superblock(journal);
1198 return -EINVAL;
1199 }
1200
1201 if (journal->j_inode == NULL) {
1202 /*
1203 * We don't know what block to start at!
1204 */
1205 printk(KERN_EMERG
1206 "%s: creation of journal on external device!\n",
1207 __func__);
1208 BUG();
1209 }
1210
1211 /* Zero out the entire journal on disk. We cannot afford to
1212 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
1213 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1214 for (i = 0; i < journal->j_maxlen; i++) {
1215 err = jbd2_journal_bmap(journal, i, &blocknr);
1216 if (err)
1217 return err;
1218 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1219 lock_buffer(bh);
1220 memset (bh->b_data, 0, journal->j_blocksize);
1221 BUFFER_TRACE(bh, "marking dirty");
1222 mark_buffer_dirty(bh);
1223 BUFFER_TRACE(bh, "marking uptodate");
1224 set_buffer_uptodate(bh);
1225 unlock_buffer(bh);
1226 __brelse(bh);
1227 }
1228
1229 sync_blockdev(journal->j_dev);
1230 jbd_debug(1, "JBD: journal cleared.\n");
1231
1232 /* OK, fill in the initial static fields in the new superblock */
1233 sb = journal->j_superblock;
1234
1235 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
1236 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1237
1238 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1239 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1240 sb->s_first = cpu_to_be32(1);
1241
1242 journal->j_transaction_sequence = 1;
1243
1244 journal->j_flags &= ~JBD2_ABORT;
1245 journal->j_format_version = 2;
1246
1247 return journal_reset(journal);
1248}
1249
1250/**
1251 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1199 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1252 * @journal: The journal to update. 1200 * @journal: The journal to update.
1253 * @wait: Set to '0' if you don't want to wait for IO completion. 1201 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1491,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
1491 spin_lock(&journal->j_list_lock); 1439 spin_lock(&journal->j_list_lock);
1492 while (journal->j_checkpoint_transactions != NULL) { 1440 while (journal->j_checkpoint_transactions != NULL) {
1493 spin_unlock(&journal->j_list_lock); 1441 spin_unlock(&journal->j_list_lock);
1442 mutex_lock(&journal->j_checkpoint_mutex);
1494 jbd2_log_do_checkpoint(journal); 1443 jbd2_log_do_checkpoint(journal);
1444 mutex_unlock(&journal->j_checkpoint_mutex);
1495 spin_lock(&journal->j_list_lock); 1445 spin_lock(&journal->j_list_lock);
1496 } 1446 }
1497 1447
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f925a4f3d05..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -1240,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1240{ 1242{
1241 transaction_t *transaction = handle->h_transaction; 1243 transaction_t *transaction = handle->h_transaction;
1242 journal_t *journal = transaction->t_journal; 1244 journal_t *journal = transaction->t_journal;
1243 int old_handle_count, err; 1245 int err;
1244 pid_t pid; 1246 pid_t pid;
1245 1247
1246 J_ASSERT(journal_current_handle() == handle); 1248 J_ASSERT(journal_current_handle() == handle);
@@ -1263,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1263 /* 1265 /*
1264 * Implement synchronous transaction batching. If the handle 1266 * Implement synchronous transaction batching. If the handle
1265 * was synchronous, don't force a commit immediately. Let's 1267 * was synchronous, don't force a commit immediately. Let's
1266 * yield and let another thread piggyback onto this transaction. 1268 * yield and let another thread piggyback onto this
1267 * Keep doing that while new threads continue to arrive. 1269 * transaction. Keep doing that while new threads continue to
1268 * It doesn't cost much - we're about to run a commit and sleep 1270 * arrive. It doesn't cost much - we're about to run a commit
1269 * on IO anyway. Speeds up many-threaded, many-dir operations 1271 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1270 * by 30x or more... 1272 * operations by 30x or more...
1273 *
1274 * We try and optimize the sleep time against what the
1275 * underlying disk can do, instead of having a static sleep
1276 * time. This is useful for the case where our storage is so
1277 * fast that it is more optimal to go ahead and force a flush
1278 * and wait for the transaction to be committed than it is to
1279 * wait for an arbitrary amount of time for new writers to
1280 * join the transaction. We achieve this by measuring how
1281 * long it takes to commit a transaction, and compare it with
1282 * how long this transaction has been running, and if run time
1283 * < commit time then we sleep for the delta and commit. This
1284 * greatly helps super fast disks that would see slowdowns as
1285 * more threads started doing fsyncs.
1271 * 1286 *
1272 * But don't do this if this process was the most recent one to 1287 * But don't do this if this process was the most recent one
1273 * perform a synchronous write. We do this to detect the case where a 1288 * to perform a synchronous write. We do this to detect the
1274 * single process is doing a stream of sync writes. No point in waiting 1289 * case where a single process is doing a stream of sync
1275 * for joiners in that case. 1290 * writes. No point in waiting for joiners in that case.
1276 */ 1291 */
1277 pid = current->pid; 1292 pid = current->pid;
1278 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1293 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1294 u64 commit_time, trans_time;
1295
1279 journal->j_last_sync_writer = pid; 1296 journal->j_last_sync_writer = pid;
1280 do { 1297
1281 old_handle_count = transaction->t_handle_count; 1298 spin_lock(&journal->j_state_lock);
1282 schedule_timeout_uninterruptible(1); 1299 commit_time = journal->j_average_commit_time;
1283 } while (old_handle_count != transaction->t_handle_count); 1300 spin_unlock(&journal->j_state_lock);
1301
1302 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1303 transaction->t_start_time));
1304
1305 commit_time = max_t(u64, commit_time,
1306 1000*journal->j_min_batch_time);
1307 commit_time = min_t(u64, commit_time,
1308 1000*journal->j_max_batch_time);
1309
1310 if (trans_time < commit_time) {
1311 ktime_t expires = ktime_add_ns(ktime_get(),
1312 commit_time);
1313 set_current_state(TASK_UNINTERRUPTIBLE);
1314 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1315 }
1284 } 1316 }
1285 1317
1286 current->journal_info = NULL; 1318 current->journal_info = NULL;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8a..170d289ac785 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
22 22
23 23
24#define BIT_DIVIDER_MIPS 1043 24#define BIT_DIVIDER_MIPS 1043
25static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ 25static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
26
27#include <linux/errno.h>
28 26
29struct pushpull { 27struct pushpull {
30 unsigned char *buf; 28 unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
43 int bits[8]; 41 int bits[8];
44}; 42};
45 43
46static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) 44static inline void init_pushpull(struct pushpull *pp, char *buf,
45 unsigned buflen, unsigned ofs,
46 unsigned reserve)
47{ 47{
48 pp->buf = buf; 48 pp->buf = buf;
49 pp->buflen = buflen; 49 pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
53 53
54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) 54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
55{ 55{
56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { 56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
57 return -ENOSPC; 57 return -ENOSPC;
58 }
59 58
60 if (bit) { 59 if (bit)
61 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); 60 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
62 } 61 else
63 else { 62 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
64 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); 63
65 }
66 pp->ofs++; 64 pp->ofs++;
67 65
68 return 0; 66 return 0;
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
97 rs->p = (long) (2 * UPPER_BIT_RUBIN); 95 rs->p = (long) (2 * UPPER_BIT_RUBIN);
98 rs->bit_number = (long) 0; 96 rs->bit_number = (long) 0;
99 rs->bit_divider = div; 97 rs->bit_divider = div;
98
100 for (c=0; c<8; c++) 99 for (c=0; c<8; c++)
101 rs->bits[c] = bits[c]; 100 rs->bits[c] = bits[c];
102} 101}
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
108 long i0, i1; 107 long i0, i1;
109 int ret; 108 int ret;
110 109
111 while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { 110 while ((rs->q >= UPPER_BIT_RUBIN) ||
111 ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
112 rs->bit_number++; 112 rs->bit_number++;
113 113
114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); 114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
119 rs->p <<= 1; 119 rs->p <<= 1;
120 } 120 }
121 i0 = A * rs->p / (A + B); 121 i0 = A * rs->p / (A + B);
122 if (i0 <= 0) { 122 if (i0 <= 0)
123 i0 = 1; 123 i0 = 1;
124 } 124
125 if (i0 >= rs->p) { 125 if (i0 >= rs->p)
126 i0 = rs->p - 1; 126 i0 = rs->p - 1;
127 } 127
128 i1 = rs->p - i0; 128 i1 = rs->p - i0;
129 129
130 if (symbol == 0) 130 if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
157 /* behalve lower */ 157 /* behalve lower */
158 rs->rec_q = 0; 158 rs->rec_q = 0;
159 159
160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) 160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
161 rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
161 ; 162 ;
162} 163}
163 164
164static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) 165static void __do_decode(struct rubin_state *rs, unsigned long p,
166 unsigned long q)
165{ 167{
166 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; 168 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
167 unsigned long rec_q; 169 unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
207 __do_decode(rs, p, q); 209 __do_decode(rs, p, q);
208 210
209 i0 = A * rs->p / (A + B); 211 i0 = A * rs->p / (A + B);
210 if (i0 <= 0) { 212 if (i0 <= 0)
211 i0 = 1; 213 i0 = 1;
212 } 214
213 if (i0 >= rs->p) { 215 if (i0 >= rs->p)
214 i0 = rs->p - 1; 216 i0 = rs->p - 1;
215 }
216 217
217 threshold = rs->q + i0; 218 threshold = rs->q + i0;
218 symbol = rs->rec_q >= threshold; 219 symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
234 struct rubin_state rs_copy; 235 struct rubin_state rs_copy;
235 rs_copy = *rs; 236 rs_copy = *rs;
236 237
237 for (i=0;i<8;i++) { 238 for (i=0; i<8; i++) {
238 ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); 239 ret = encode(rs, rs->bit_divider-rs->bits[i],
240 rs->bits[i], byte & 1);
239 if (ret) { 241 if (ret) {
240 /* Failed. Restore old state */ 242 /* Failed. Restore old state */
241 *rs = rs_copy; 243 *rs = rs_copy;
242 return ret; 244 return ret;
243 } 245 }
244 byte=byte>>1; 246 byte >>= 1 ;
245 } 247 }
246 return 0; 248 return 0;
247} 249}
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
251 int i, result = 0, bit_divider = rs->bit_divider; 253 int i, result = 0, bit_divider = rs->bit_divider;
252 254
253 for (i = 0; i < 8; i++) 255 for (i = 0; i < 8; i++)
254 result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; 256 result |= decode(rs, bit_divider - rs->bits[i],
257 rs->bits[i]) << i;
255 258
256 return result; 259 return result;
257} 260}
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
259 262
260 263
261static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, 264static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
262 unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) 265 unsigned char *cpage_out, uint32_t *sourcelen,
266 uint32_t *dstlen)
263 { 267 {
264 int outpos = 0; 268 int outpos = 0;
265 int pos=0; 269 int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
295int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 299int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
296 uint32_t *sourcelen, uint32_t *dstlen, void *model) 300 uint32_t *sourcelen, uint32_t *dstlen, void *model)
297{ 301{
298 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 302 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
303 cpage_out, sourcelen, dstlen);
299} 304}
300#endif 305#endif
301static int jffs2_dynrubin_compress(unsigned char *data_in, 306static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
316 return -1; 321 return -1;
317 322
318 memset(histo, 0, 256); 323 memset(histo, 0, 256);
319 for (i=0; i<mysrclen; i++) { 324 for (i=0; i<mysrclen; i++)
320 histo[data_in[i]]++; 325 histo[data_in[i]]++;
321 }
322 memset(bits, 0, sizeof(int)*8); 326 memset(bits, 0, sizeof(int)*8);
323 for (i=0; i<256; i++) { 327 for (i=0; i<256; i++) {
324 if (i&128) 328 if (i&128)
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
346 cpage_out[i] = bits[i]; 350 cpage_out[i] = bits[i];
347 } 351 }
348 352
349 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); 353 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
354 &mydstlen);
350 if (ret) 355 if (ret)
351 return ret; 356 return ret;
352 357
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
363 return 0; 368 return 0;
364} 369}
365 370
366static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, 371static void rubin_do_decompress(int bit_divider, int *bits,
367 unsigned char *page_out, uint32_t srclen, uint32_t destlen) 372 unsigned char *cdata_in,
373 unsigned char *page_out, uint32_t srclen,
374 uint32_t destlen)
368{ 375{
369 int outpos = 0; 376 int outpos = 0;
370 struct rubin_state rs; 377 struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
372 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); 379 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
373 init_decode(&rs, bit_divider, bits); 380 init_decode(&rs, bit_divider, bits);
374 381
375 while (outpos < destlen) { 382 while (outpos < destlen)
376 page_out[outpos++] = in_byte(&rs); 383 page_out[outpos++] = in_byte(&rs);
377 }
378} 384}
379 385
380 386
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
383 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen,
384 void *model) 390 void *model)
385{ 391{
386 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 392 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
393 cpage_out, sourcelen, dstlen);
387 return 0; 394 return 0;
388} 395}
389 396
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
398 for (c=0; c<8; c++) 405 for (c=0; c<8; c++)
399 bits[c] = data_in[c]; 406 bits[c] = data_in[c];
400 407
401 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); 408 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
409 dstlen);
402 return 0; 410 return 0;
403} 411}
404 412
405static struct jffs2_compressor jffs2_rubinmips_comp = { 413static struct jffs2_compressor jffs2_rubinmips_comp = {
406 .priority = JFFS2_RUBINMIPS_PRIORITY, 414 .priority = JFFS2_RUBINMIPS_PRIORITY,
407 .name = "rubinmips", 415 .name = "rubinmips",
408 .compr = JFFS2_COMPR_DYNRUBIN, 416 .compr = JFFS2_COMPR_DYNRUBIN,
409 .compress = NULL, /*&jffs2_rubinmips_compress,*/ 417 .compress = NULL, /*&jffs2_rubinmips_compress,*/
410 .decompress = &jffs2_rubinmips_decompress, 418 .decompress = &jffs2_rubinmips_decompress,
411#ifdef JFFS2_RUBINMIPS_DISABLED 419#ifdef JFFS2_RUBINMIPS_DISABLED
412 .disabled = 1, 420 .disabled = 1,
413#else 421#else
414 .disabled = 0, 422 .disabled = 0,
415#endif 423#endif
416}; 424};
417 425
418int jffs2_rubinmips_init(void) 426int jffs2_rubinmips_init(void)
419{ 427{
420 return jffs2_register_compressor(&jffs2_rubinmips_comp); 428 return jffs2_register_compressor(&jffs2_rubinmips_comp);
421} 429}
422 430
423void jffs2_rubinmips_exit(void) 431void jffs2_rubinmips_exit(void)
424{ 432{
425 jffs2_unregister_compressor(&jffs2_rubinmips_comp); 433 jffs2_unregister_compressor(&jffs2_rubinmips_comp);
426} 434}
427 435
428static struct jffs2_compressor jffs2_dynrubin_comp = { 436static struct jffs2_compressor jffs2_dynrubin_comp = {
429 .priority = JFFS2_DYNRUBIN_PRIORITY, 437 .priority = JFFS2_DYNRUBIN_PRIORITY,
430 .name = "dynrubin", 438 .name = "dynrubin",
431 .compr = JFFS2_COMPR_RUBINMIPS, 439 .compr = JFFS2_COMPR_RUBINMIPS,
432 .compress = jffs2_dynrubin_compress, 440 .compress = jffs2_dynrubin_compress,
433 .decompress = &jffs2_dynrubin_decompress, 441 .decompress = &jffs2_dynrubin_decompress,
434#ifdef JFFS2_DYNRUBIN_DISABLED 442#ifdef JFFS2_DYNRUBIN_DISABLED
435 .disabled = 1, 443 .disabled = 1,
436#else 444#else
437 .disabled = 0, 445 .disabled = 0,
438#endif 446#endif
439}; 447};
440 448
441int jffs2_dynrubin_init(void) 449int jffs2_dynrubin_init(void)
442{ 450{
443 return jffs2_register_compressor(&jffs2_dynrubin_comp); 451 return jffs2_register_compressor(&jffs2_dynrubin_comp);
444} 452}
445 453
446void jffs2_dynrubin_exit(void) 454void jffs2_dynrubin_exit(void)
447{ 455{
448 jffs2_unregister_compressor(&jffs2_dynrubin_comp); 456 jffs2_unregister_compressor(&jffs2_dynrubin_comp);
449} 457}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910af..c32b4a1ad6cf 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
175{ 175{
176 /* For NAND, if the failure did not occur at the device level for a 176 /* For NAND, if the failure did not occur at the device level for a
177 specific physical page, don't bother updating the bad block table. */ 177 specific physical page, don't bother updating the bad block table. */
178 if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { 178 if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
179 /* We had a device-level failure to erase. Let's see if we've 179 /* We had a device-level failure to erase. Let's see if we've
180 failed too many times. */ 180 failed too many times. */
181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { 181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
209 struct erase_priv_struct *priv = (void *)instr->priv; 209 struct erase_priv_struct *priv = (void *)instr->priv;
210 210
211 if(instr->state != MTD_ERASE_DONE) { 211 if(instr->state != MTD_ERASE_DONE) {
212 printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); 212 printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
213 (unsigned long long)instr->addr, instr->state);
213 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); 214 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
214 } else { 215 } else {
215 jffs2_erase_succeeded(priv->c, priv->jeb); 216 jffs2_erase_succeeded(priv->c, priv->jeb);
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c3..507ed6ec1847 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); 366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); 367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); 368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
369struct rb_node *rb_next(struct rb_node *);
370struct rb_node *rb_prev(struct rb_node *);
371void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
372int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); 369int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
373uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); 370uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
374struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, 371struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0dae345e481b..b37d1f78b854 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -543,7 +543,7 @@ out_kfree:
543 return ret; 543 return ret;
544} 544}
545 545
546static void jfs_write_super_lockfs(struct super_block *sb) 546static int jfs_freeze(struct super_block *sb)
547{ 547{
548 struct jfs_sb_info *sbi = JFS_SBI(sb); 548 struct jfs_sb_info *sbi = JFS_SBI(sb);
549 struct jfs_log *log = sbi->log; 549 struct jfs_log *log = sbi->log;
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb)
553 lmLogShutdown(log); 553 lmLogShutdown(log);
554 updateSuper(sb, FM_CLEAN); 554 updateSuper(sb, FM_CLEAN);
555 } 555 }
556 return 0;
556} 557}
557 558
558static void jfs_unlockfs(struct super_block *sb) 559static int jfs_unfreeze(struct super_block *sb)
559{ 560{
560 struct jfs_sb_info *sbi = JFS_SBI(sb); 561 struct jfs_sb_info *sbi = JFS_SBI(sb);
561 struct jfs_log *log = sbi->log; 562 struct jfs_log *log = sbi->log;
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb)
568 else 569 else
569 txResume(sb); 570 txResume(sb);
570 } 571 }
572 return 0;
571} 573}
572 574
573static int jfs_get_sb(struct file_system_type *fs_type, 575static int jfs_get_sb(struct file_system_type *fs_type,
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = {
735 .delete_inode = jfs_delete_inode, 737 .delete_inode = jfs_delete_inode,
736 .put_super = jfs_put_super, 738 .put_super = jfs_put_super,
737 .sync_fs = jfs_sync_fs, 739 .sync_fs = jfs_sync_fs,
738 .write_super_lockfs = jfs_write_super_lockfs, 740 .freeze_fs = jfs_freeze,
739 .unlockfs = jfs_unlockfs, 741 .unfreeze_fs = jfs_unfreeze,
740 .statfs = jfs_statfs, 742 .statfs = jfs_statfs,
741 .remount_fs = jfs_remount, 743 .remount_fs = jfs_remount,
742 .show_options = jfs_show_options, 744 .show_options = jfs_show_options,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 54ff4c77aaa3..d861096c9d81 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3868,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
3868 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3868 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3869 struct ocfs2_extent_rec *rec, *tmprec; 3869 struct ocfs2_extent_rec *rec, *tmprec;
3870 3870
3871 right_el = path_leaf_el(right_path);; 3871 right_el = path_leaf_el(right_path);
3872 if (left_path) 3872 if (left_path)
3873 left_el = path_leaf_el(left_path); 3873 left_el = path_leaf_el(left_path);
3874 3874
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f731ab491795..b0c4cadd4c45 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1324,7 +1324,7 @@ again:
1324 goto out; 1324 goto out;
1325 } 1325 }
1326 1326
1327 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", 1327 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1328 lockres->l_name); 1328 lockres->l_name);
1329 1329
1330 /* At this point we've gone inside the dlm and need to 1330 /* At this point we've gone inside the dlm and need to
@@ -2951,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2951 ocfs2_dlm_dump_lksb(&lockres->l_lksb); 2951 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2952 BUG(); 2952 BUG();
2953 } 2953 }
2954 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", 2954 mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
2955 lockres->l_name); 2955 lockres->l_name);
2956 2956
2957 ocfs2_wait_on_busy_lock(lockres); 2957 ocfs2_wait_on_busy_lock(lockres);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e8f795f978aa..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1605,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1605 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1606{ 1606{
1607 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1609 1609
1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1611 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5198ada67398..6d720243f5f4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
334 334
335 blk_free_devt(part_devt(part)); 335 blk_free_devt(part_devt(part));
336 rcu_assign_pointer(ptbl->part[partno], NULL); 336 rcu_assign_pointer(ptbl->part[partno], NULL);
337 rcu_assign_pointer(ptbl->last_lookup, NULL);
337 kobject_put(part->holder_dir); 338 kobject_put(part->holder_dir);
338 device_del(part_to_dev(part)); 339 device_del(part_to_dev(part));
339 340
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61ce..cd53ff838498 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do { \
41 (vmi)->used = 0; \ 41 (vmi)->used = 0; \
42 (vmi)->largest_chunk = 0; \ 42 (vmi)->largest_chunk = 0; \
43} while(0) 43} while(0)
44
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 44#endif
47 45
48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 46extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66da..43d23948384a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
74 "LowTotal: %8lu kB\n" 74 "LowTotal: %8lu kB\n"
75 "LowFree: %8lu kB\n" 75 "LowFree: %8lu kB\n"
76#endif 76#endif
77#ifndef CONFIG_MMU
78 "MmapCopy: %8lu kB\n"
79#endif
77 "SwapTotal: %8lu kB\n" 80 "SwapTotal: %8lu kB\n"
78 "SwapFree: %8lu kB\n" 81 "SwapFree: %8lu kB\n"
79 "Dirty: %8lu kB\n" 82 "Dirty: %8lu kB\n"
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
116 K(i.totalram-i.totalhigh), 119 K(i.totalram-i.totalhigh),
117 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
118#endif 121#endif
122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)),
124#endif
119 K(i.totalswap), 125 K(i.totalswap),
120 K(i.freeswap), 126 K(i.freeswap),
121 K(global_page_state(NR_FILE_DIRTY)), 127 K(global_page_state(NR_FILE_DIRTY)),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d2632947..b446d7ad0b0d 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
33#include "internal.h" 33#include "internal.h"
34 34
35/* 35/*
36 * display a single VMA to a sequenced file 36 * display a single region to a sequenced file
37 */ 37 */
38int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) 38static int nommu_region_show(struct seq_file *m, struct vm_region *region)
39{ 39{
40 unsigned long ino = 0; 40 unsigned long ino = 0;
41 struct file *file; 41 struct file *file;
42 dev_t dev = 0; 42 dev_t dev = 0;
43 int flags, len; 43 int flags, len;
44 44
45 flags = vma->vm_flags; 45 flags = region->vm_flags;
46 file = vma->vm_file; 46 file = region->vm_file;
47 47
48 if (file) { 48 if (file) {
49 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 49 struct inode *inode = region->vm_file->f_path.dentry->d_inode;
50 dev = inode->i_sb->s_dev; 50 dev = inode->i_sb->s_dev;
51 ino = inode->i_ino; 51 ino = inode->i_ino;
52 } 52 }
53 53
54 seq_printf(m, 54 seq_printf(m,
55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
56 vma->vm_start, 56 region->vm_start,
57 vma->vm_end, 57 region->vm_end,
58 flags & VM_READ ? 'r' : '-', 58 flags & VM_READ ? 'r' : '-',
59 flags & VM_WRITE ? 'w' : '-', 59 flags & VM_WRITE ? 'w' : '-',
60 flags & VM_EXEC ? 'x' : '-', 60 flags & VM_EXEC ? 'x' : '-',
61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
62 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 62 ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
63 MAJOR(dev), MINOR(dev), ino, &len); 63 MAJOR(dev), MINOR(dev), ino, &len);
64 64
65 if (file) { 65 if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
75} 75}
76 76
77/* 77/*
78 * display a list of all the VMAs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernals have a single flat list
80 */ 80 */
81static int nommu_vma_list_show(struct seq_file *m, void *v) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
83 struct vm_area_struct *vma; 83 struct rb_node *p = _p;
84 84
85 vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); 85 return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
86 return nommu_vma_show(m, vma);
87} 86}
88 87
89static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) 88static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
90{ 89{
91 struct rb_node *_rb; 90 struct rb_node *p;
92 loff_t pos = *_pos; 91 loff_t pos = *_pos;
93 void *next = NULL;
94 92
95 down_read(&nommu_vma_sem); 93 down_read(&nommu_region_sem);
96 94
97 for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { 95 for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
98 if (pos == 0) { 96 if (pos-- == 0)
99 next = _rb; 97 return p;
100 break; 98 return NULL;
101 }
102 pos--;
103 }
104
105 return next;
106} 99}
107 100
108static void nommu_vma_list_stop(struct seq_file *m, void *v) 101static void nommu_region_list_stop(struct seq_file *m, void *v)
109{ 102{
110 up_read(&nommu_vma_sem); 103 up_read(&nommu_region_sem);
111} 104}
112 105
113static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) 106static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
114{ 107{
115 (*pos)++; 108 (*pos)++;
116 return rb_next((struct rb_node *) v); 109 return rb_next((struct rb_node *) v);
117} 110}
118 111
119static const struct seq_operations proc_nommu_vma_list_seqop = { 112static struct seq_operations proc_nommu_region_list_seqop = {
120 .start = nommu_vma_list_start, 113 .start = nommu_region_list_start,
121 .next = nommu_vma_list_next, 114 .next = nommu_region_list_next,
122 .stop = nommu_vma_list_stop, 115 .stop = nommu_region_list_stop,
123 .show = nommu_vma_list_show 116 .show = nommu_region_list_show
124}; 117};
125 118
126static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) 119static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
127{ 120{
128 return seq_open(file, &proc_nommu_vma_list_seqop); 121 return seq_open(file, &proc_nommu_region_list_seqop);
129} 122}
130 123
131static const struct file_operations proc_nommu_vma_list_operations = { 124static const struct file_operations proc_nommu_region_list_operations = {
132 .open = proc_nommu_vma_list_open, 125 .open = proc_nommu_region_list_open,
133 .read = seq_read, 126 .read = seq_read,
134 .llseek = seq_lseek, 127 .llseek = seq_lseek,
135 .release = seq_release, 128 .release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
137 130
138static int __init proc_nommu_init(void) 131static int __init proc_nommu_init(void)
139{ 132{
140 proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); 133 proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
141 return 0; 134 return 0;
142} 135}
143 136
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index d4a8be32b902..343ea1216bc8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -15,25 +15,32 @@
15 */ 15 */
16void task_mem(struct seq_file *m, struct mm_struct *mm) 16void task_mem(struct seq_file *m, struct mm_struct *mm)
17{ 17{
18 struct vm_list_struct *vml; 18 struct vm_area_struct *vma;
19 unsigned long bytes = 0, sbytes = 0, slack = 0; 19 struct vm_region *region;
20 struct rb_node *p;
21 unsigned long bytes = 0, sbytes = 0, slack = 0, size;
20 22
21 down_read(&mm->mmap_sem); 23 down_read(&mm->mmap_sem);
22 for (vml = mm->context.vmlist; vml; vml = vml->next) { 24 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
23 if (!vml->vma) 25 vma = rb_entry(p, struct vm_area_struct, vm_rb);
24 continue; 26
27 bytes += kobjsize(vma);
28
29 region = vma->vm_region;
30 if (region) {
31 size = kobjsize(region);
32 size += region->vm_end - region->vm_start;
33 } else {
34 size = vma->vm_end - vma->vm_start;
35 }
25 36
26 bytes += kobjsize(vml);
27 if (atomic_read(&mm->mm_count) > 1 || 37 if (atomic_read(&mm->mm_count) > 1 ||
28 atomic_read(&vml->vma->vm_usage) > 1 38 vma->vm_flags & VM_MAYSHARE) {
29 ) { 39 sbytes += size;
30 sbytes += kobjsize((void *) vml->vma->vm_start);
31 sbytes += kobjsize(vml->vma);
32 } else { 40 } else {
33 bytes += kobjsize((void *) vml->vma->vm_start); 41 bytes += size;
34 bytes += kobjsize(vml->vma); 42 if (region)
35 slack += kobjsize((void *) vml->vma->vm_start) - 43 slack = region->vm_end - vma->vm_end;
36 (vml->vma->vm_end - vml->vma->vm_start);
37 } 44 }
38 } 45 }
39 46
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
70 77
71unsigned long task_vsize(struct mm_struct *mm) 78unsigned long task_vsize(struct mm_struct *mm)
72{ 79{
73 struct vm_list_struct *tbp; 80 struct vm_area_struct *vma;
81 struct rb_node *p;
74 unsigned long vsize = 0; 82 unsigned long vsize = 0;
75 83
76 down_read(&mm->mmap_sem); 84 down_read(&mm->mmap_sem);
77 for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { 85 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
78 if (tbp->vma) 86 vma = rb_entry(p, struct vm_area_struct, vm_rb);
79 vsize += kobjsize((void *) tbp->vma->vm_start); 87 vsize += vma->vm_end - vma->vm_start;
80 } 88 }
81 up_read(&mm->mmap_sem); 89 up_read(&mm->mmap_sem);
82 return vsize; 90 return vsize;
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm)
85int task_statm(struct mm_struct *mm, int *shared, int *text, 93int task_statm(struct mm_struct *mm, int *shared, int *text,
86 int *data, int *resident) 94 int *data, int *resident)
87{ 95{
88 struct vm_list_struct *tbp; 96 struct vm_area_struct *vma;
97 struct vm_region *region;
98 struct rb_node *p;
89 int size = kobjsize(mm); 99 int size = kobjsize(mm);
90 100
91 down_read(&mm->mmap_sem); 101 down_read(&mm->mmap_sem);
92 for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { 102 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
93 size += kobjsize(tbp); 103 vma = rb_entry(p, struct vm_area_struct, vm_rb);
94 if (tbp->vma) { 104 size += kobjsize(vma);
95 size += kobjsize(tbp->vma); 105 region = vma->vm_region;
96 size += kobjsize((void *) tbp->vma->vm_start); 106 if (region) {
107 size += kobjsize(region);
108 size += region->vm_end - region->vm_start;
97 } 109 }
98 } 110 }
99 111
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
105} 117}
106 118
107/* 119/*
120 * display a single VMA to a sequenced file
121 */
122static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
123{
124 unsigned long ino = 0;
125 struct file *file;
126 dev_t dev = 0;
127 int flags, len;
128
129 flags = vma->vm_flags;
130 file = vma->vm_file;
131
132 if (file) {
133 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
134 dev = inode->i_sb->s_dev;
135 ino = inode->i_ino;
136 }
137
138 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
140 vma->vm_start,
141 vma->vm_end,
142 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT,
147 MAJOR(dev), MINOR(dev), ino, &len);
148
149 if (file) {
150 len = 25 + sizeof(void *) * 6 - len;
151 if (len < 1)
152 len = 1;
153 seq_printf(m, "%*c", len, ' ');
154 seq_path(m, &file->f_path, "");
155 }
156
157 seq_putc(m, '\n');
158 return 0;
159}
160
161/*
108 * display mapping lines for a particular process's /proc/pid/maps 162 * display mapping lines for a particular process's /proc/pid/maps
109 */ 163 */
110static int show_map(struct seq_file *m, void *_vml) 164static int show_map(struct seq_file *m, void *_p)
111{ 165{
112 struct vm_list_struct *vml = _vml; 166 struct rb_node *p = _p;
113 167
114 return nommu_vma_show(m, vml->vma); 168 return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
115} 169}
116 170
117static void *m_start(struct seq_file *m, loff_t *pos) 171static void *m_start(struct seq_file *m, loff_t *pos)
118{ 172{
119 struct proc_maps_private *priv = m->private; 173 struct proc_maps_private *priv = m->private;
120 struct vm_list_struct *vml;
121 struct mm_struct *mm; 174 struct mm_struct *mm;
175 struct rb_node *p;
122 loff_t n = *pos; 176 loff_t n = *pos;
123 177
124 /* pin the task and mm whilst we play with them */ 178 /* pin the task and mm whilst we play with them */
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
134 } 188 }
135 189
136 /* start from the Nth VMA */ 190 /* start from the Nth VMA */
137 for (vml = mm->context.vmlist; vml; vml = vml->next) 191 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
138 if (n-- == 0) 192 if (n-- == 0)
139 return vml; 193 return p;
140 return NULL; 194 return NULL;
141} 195}
142 196
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml)
152 } 206 }
153} 207}
154 208
155static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) 209static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
156{ 210{
157 struct vm_list_struct *vml = _vml; 211 struct rb_node *p = _p;
158 212
159 (*pos)++; 213 (*pos)++;
160 return vml ? vml->next : NULL; 214 return p ? rb_next(p) : NULL;
161} 215}
162 216
163static const struct seq_operations proc_pid_maps_ops = { 217static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
47 47
48 offset = (unsigned long)(*ppos % PAGE_SIZE); 48 offset = (unsigned long)(*ppos % PAGE_SIZE);
49 pfn = (unsigned long)(*ppos / PAGE_SIZE); 49 pfn = (unsigned long)(*ppos / PAGE_SIZE);
50 if (pfn > saved_max_pfn)
51 return -EINVAL;
52 50
53 do { 51 do {
54 if (count > (PAGE_SIZE - offset)) 52 if (count > (PAGE_SIZE - offset))
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc34611..b9b567a28376 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
262 ret = -ENOMEM; 262 ret = -ENOMEM;
263 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); 263 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
264 if (!pages) 264 if (!pages)
265 goto out; 265 goto out_free;
266 266
267 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); 267 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
268 if (nr != lpages) 268 if (nr != lpages)
269 goto out; /* leave if some pages were missing */ 269 goto out_free_pages; /* leave if some pages were missing */
270 270
271 /* check the pages for physical adjacency */ 271 /* check the pages for physical adjacency */
272 ptr = pages; 272 ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
274 page++; 274 page++;
275 for (loop = lpages; loop > 1; loop--) 275 for (loop = lpages; loop > 1; loop--)
276 if (*ptr++ != page++) 276 if (*ptr++ != page++)
277 goto out; 277 goto out_free_pages;
278 278
279 /* okay - all conditions fulfilled */ 279 /* okay - all conditions fulfilled */
280 ret = (unsigned long) page_address(pages[0]); 280 ret = (unsigned long) page_address(pages[0]);
281 281
282 out: 282out_free_pages:
283 if (pages) { 283 ptr = pages;
284 ptr = pages; 284 for (loop = nr; loop > 0; loop--)
285 for (loop = lpages; loop > 0; loop--) 285 put_page(*ptr++);
286 put_page(*ptr++); 286out_free:
287 kfree(pages); 287 kfree(pages);
288 } 288out:
289
290 return ret; 289 return ret;
291} 290}
292 291
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c55651f1407c..f3c820b75829 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s)
83 reiserfs_sync_fs(s, 1); 83 reiserfs_sync_fs(s, 1);
84} 84}
85 85
86static void reiserfs_write_super_lockfs(struct super_block *s) 86static int reiserfs_freeze(struct super_block *s)
87{ 87{
88 struct reiserfs_transaction_handle th; 88 struct reiserfs_transaction_handle th;
89 reiserfs_write_lock(s); 89 reiserfs_write_lock(s);
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s)
101 } 101 }
102 s->s_dirt = 0; 102 s->s_dirt = 0;
103 reiserfs_write_unlock(s); 103 reiserfs_write_unlock(s);
104 return 0;
104} 105}
105 106
106static void reiserfs_unlockfs(struct super_block *s) 107static int reiserfs_unfreeze(struct super_block *s)
107{ 108{
108 reiserfs_allow_writes(s); 109 reiserfs_allow_writes(s);
110 return 0;
109} 111}
110 112
111extern const struct in_core_key MAX_IN_CORE_KEY; 113extern const struct in_core_key MAX_IN_CORE_KEY;
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = {
613 .put_super = reiserfs_put_super, 615 .put_super = reiserfs_put_super,
614 .write_super = reiserfs_write_super, 616 .write_super = reiserfs_write_super,
615 .sync_fs = reiserfs_sync_fs, 617 .sync_fs = reiserfs_sync_fs,
616 .write_super_lockfs = reiserfs_write_super_lockfs, 618 .freeze_fs = reiserfs_freeze,
617 .unlockfs = reiserfs_unlockfs, 619 .unfreeze_fs = reiserfs_unfreeze,
618 .statfs = reiserfs_statfs, 620 .statfs = reiserfs_statfs,
619 .remount_fs = reiserfs_remount, 621 .remount_fs = reiserfs_remount,
620 .show_options = generic_show_options, 622 .show_options = generic_show_options,
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index c97d4c931715..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
490static struct inode * 490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino) 491romfs_iget(struct super_block *sb, unsigned long ino)
492{ 492{
493 int nextfh; 493 int nextfh, ret;
494 struct romfs_inode ri; 494 struct romfs_inode ri;
495 struct inode *i; 495 struct inode *i;
496 496
@@ -526,11 +526,11 @@ romfs_iget(struct super_block *sb, unsigned long ino)
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; 526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527 527
528 /* Precalculate the data offset */ 528 /* Precalculate the data offset */
529 ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); 529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ino >= 0) 530 if (ret >= 0)
531 ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); 531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else 532 else
533 ino = 0; 533 ino = 0;
534 534
535 ROMFS_I(i)->i_metasize = ino; 535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); 536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/select.c b/fs/select.c
index 08b91beed806..b0cf1f0896d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -610,7 +610,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
610 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 610 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
611 } 611 }
612 612
613 ret = core_sys_select(n, inp, outp, exp, &end_time); 613 ret = core_sys_select(n, inp, outp, exp, to);
614 ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); 614 ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
615 615
616 if (ret == -ERESTARTNOHAND) { 616 if (ret == -ERESTARTNOHAND) {
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..a54b3e3f10a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/splice.h> 23#include <linux/splice.h>
24#include <linux/memcontrol.h>
24#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 000000000000..8258cf9a0317
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux squashfs routines.
3#
4
5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o
8#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 000000000000..c837dfc2b3c6
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * block.c
22 */
23
24/*
25 * This file implements the low-level routines to read and decompress
26 * datablocks and metadata blocks.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h>
34#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36
37#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h"
40#include "squashfs.h"
41
42/*
43 * Read the metadata block length, this is stored in the first two
44 * bytes of the metadata block.
45 */
46static struct buffer_head *get_block_length(struct super_block *sb,
47 u64 *cur_index, int *offset, int *length)
48{
49 struct squashfs_sb_info *msblk = sb->s_fs_info;
50 struct buffer_head *bh;
51
52 bh = sb_bread(sb, *cur_index);
53 if (bh == NULL)
54 return NULL;
55
56 if (msblk->devblksize - *offset == 1) {
57 *length = (unsigned char) bh->b_data[*offset];
58 put_bh(bh);
59 bh = sb_bread(sb, ++(*cur_index));
60 if (bh == NULL)
61 return NULL;
62 *length |= (unsigned char) bh->b_data[0] << 8;
63 *offset = 1;
64 } else {
65 *length = (unsigned char) bh->b_data[*offset] |
66 (unsigned char) bh->b_data[*offset + 1] << 8;
67 *offset += 2;
68 }
69
70 return bh;
71}
72
73
74/*
75 * Read and decompress a metadata block or datablock. Length is non-zero
76 * if a datablock is being read (the size is stored elsewhere in the
77 * filesystem), otherwise the length is obtained from the first two bytes of
78 * the metadata block. A bit in the length field indicates if the block
79 * is stored uncompressed in the filesystem (usually because compression
80 * generated a larger block - this does occasionally happen with zlib).
81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength)
84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh;
87 int offset = index & ((1 << msblk->devblksize_log2) - 1);
88 u64 cur_index = index >> msblk->devblksize_log2;
89 int bytes, compressed, b = 0, k = 0, page = 0, avail;
90
91
92 bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
93 sizeof(*bh), GFP_KERNEL);
94 if (bh == NULL)
95 return -ENOMEM;
96
97 if (length) {
98 /*
99 * Datablock.
100 */
101 bytes = -offset;
102 compressed = SQUASHFS_COMPRESSED_BLOCK(length);
103 length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
104 if (next_index)
105 *next_index = index + length;
106
107 TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
108 index, compressed ? "" : "un", length, srclength);
109
110 if (length < 0 || length > srclength ||
111 (index + length) > msblk->bytes_used)
112 goto read_failure;
113
114 for (b = 0; bytes < length; b++, cur_index++) {
115 bh[b] = sb_getblk(sb, cur_index);
116 if (bh[b] == NULL)
117 goto block_release;
118 bytes += msblk->devblksize;
119 }
120 ll_rw_block(READ, b, bh);
121 } else {
122 /*
123 * Metadata block.
124 */
125 if ((index + 2) > msblk->bytes_used)
126 goto read_failure;
127
128 bh[0] = get_block_length(sb, &cur_index, &offset, &length);
129 if (bh[0] == NULL)
130 goto read_failure;
131 b = 1;
132
133 bytes = msblk->devblksize - offset;
134 compressed = SQUASHFS_COMPRESSED(length);
135 length = SQUASHFS_COMPRESSED_SIZE(length);
136 if (next_index)
137 *next_index = index + length + 2;
138
139 TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
140 compressed ? "" : "un", length);
141
142 if (length < 0 || length > srclength ||
143 (index + length) > msblk->bytes_used)
144 goto block_release;
145
146 for (; bytes < length; b++) {
147 bh[b] = sb_getblk(sb, ++cur_index);
148 if (bh[b] == NULL)
149 goto block_release;
150 bytes += msblk->devblksize;
151 }
152 ll_rw_block(READ, b - 1, bh + 1);
153 }
154
155 if (compressed) {
156 int zlib_err = 0, zlib_init = 0;
157
158 /*
159 * Uncompress block.
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result"
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex;
217 }
218
219 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex;
224 }
225 length = msblk->stream.total_out;
226 mutex_unlock(&msblk->read_data_mutex);
227 } else {
228 /*
229 * Block is uncompressed.
230 */
231 int i, in, pg_offset = 0;
232
233 for (i = 0; i < b; i++) {
234 wait_on_buffer(bh[i]);
235 if (!buffer_uptodate(bh[i]))
236 goto block_release;
237 }
238
239 for (bytes = length; k < b; k++) {
240 in = min(bytes, msblk->devblksize - offset);
241 bytes -= in;
242 while (in) {
243 if (pg_offset == PAGE_CACHE_SIZE) {
244 page++;
245 pg_offset = 0;
246 }
247 avail = min_t(int, in, PAGE_CACHE_SIZE -
248 pg_offset);
249 memcpy(buffer[page] + pg_offset,
250 bh[k]->b_data + offset, avail);
251 in -= avail;
252 pg_offset += avail;
253 offset += avail;
254 }
255 offset = 0;
256 put_bh(bh[k]);
257 }
258 }
259
260 kfree(bh);
261 return length;
262
263release_mutex:
264 mutex_unlock(&msblk->read_data_mutex);
265
266block_release:
267 for (; k < b; k++)
268 put_bh(bh[k]);
269
270read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
272 kfree(bh);
273 return -EIO;
274}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 000000000000..f29eda16d25e
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * cache.c
22 */
23
24/*
25 * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
26 * recently accessed data Squashfs uses two small metadata and fragment caches.
27 *
28 * This file implements a generic cache implementation used for both caches,
29 * plus functions layered ontop of the generic cache implementation to
30 * access the metadata and fragment caches.
31 *
32 * To avoid out of memory and fragmentation isssues with vmalloc the cache
33 * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
34 *
35 * It should be noted that the cache is not used for file datablocks, these
36 * are decompressed and cached in the page-cache in the normal way. The
37 * cache is only used to temporarily cache fragment and metadata blocks
38 * which have been read as as a result of a metadata (i.e. inode or
39 * directory) or fragment access. Because metadata and fragments are packed
40 * together into blocks (to gain greater compression) the read of a particular
41 * piece of metadata or fragment will retrieve other metadata/fragments which
42 * have been packed with it, these because of locality-of-reference may be read
43 * in the near future. Temporarily caching them ensures they are available for
44 * near future access without requiring an additional read and decompress.
45 */
46
47#include <linux/fs.h>
48#include <linux/vfs.h>
49#include <linux/slab.h>
50#include <linux/vmalloc.h>
51#include <linux/sched.h>
52#include <linux/spinlock.h>
53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h>
56
57#include "squashfs_fs.h"
58#include "squashfs_fs_sb.h"
59#include "squashfs_fs_i.h"
60#include "squashfs.h"
61
62/*
63 * Look-up block in cache, and increment usage count. If not in cache, read
64 * and decompress it from disk.
65 */
66struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
67 struct squashfs_cache *cache, u64 block, int length)
68{
69 int i, n;
70 struct squashfs_cache_entry *entry;
71
72 spin_lock(&cache->lock);
73
74 while (1) {
75 for (i = 0; i < cache->entries; i++)
76 if (cache->entry[i].block == block)
77 break;
78
79 if (i == cache->entries) {
80 /*
81 * Block not in cache, if all cache entries are used
82 * go to sleep waiting for one to become available.
83 */
84 if (cache->unused == 0) {
85 cache->num_waiters++;
86 spin_unlock(&cache->lock);
87 wait_event(cache->wait_queue, cache->unused);
88 spin_lock(&cache->lock);
89 cache->num_waiters--;
90 continue;
91 }
92
93 /*
94 * At least one unused cache entry. A simple
95 * round-robin strategy is used to choose the entry to
96 * be evicted from the cache.
97 */
98 i = cache->next_blk;
99 for (n = 0; n < cache->entries; n++) {
100 if (cache->entry[i].refcount == 0)
101 break;
102 i = (i + 1) % cache->entries;
103 }
104
105 cache->next_blk = (i + 1) % cache->entries;
106 entry = &cache->entry[i];
107
108 /*
109 * Initialise choosen cache entry, and fill it in from
110 * disk.
111 */
112 cache->unused--;
113 entry->block = block;
114 entry->refcount = 1;
115 entry->pending = 1;
116 entry->num_waiters = 0;
117 entry->error = 0;
118 spin_unlock(&cache->lock);
119
120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index,
122 cache->block_size);
123
124 spin_lock(&cache->lock);
125
126 if (entry->length < 0)
127 entry->error = entry->length;
128
129 entry->pending = 0;
130
131 /*
132 * While filling this entry one or more other processes
133 * have looked it up in the cache, and have slept
134 * waiting for it to become available.
135 */
136 if (entry->num_waiters) {
137 spin_unlock(&cache->lock);
138 wake_up_all(&entry->wait_queue);
139 } else
140 spin_unlock(&cache->lock);
141
142 goto out;
143 }
144
145 /*
146 * Block already in cache. Increment refcount so it doesn't
147 * get reused until we're finished with it, if it was
148 * previously unused there's one less cache entry available
149 * for reuse.
150 */
151 entry = &cache->entry[i];
152 if (entry->refcount == 0)
153 cache->unused--;
154 entry->refcount++;
155
156 /*
157 * If the entry is currently being filled in by another process
158 * go to sleep waiting for it to become available.
159 */
160 if (entry->pending) {
161 entry->num_waiters++;
162 spin_unlock(&cache->lock);
163 wait_event(entry->wait_queue, !entry->pending);
164 } else
165 spin_unlock(&cache->lock);
166
167 goto out;
168 }
169
170out:
171 TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
172 cache->name, i, entry->block, entry->refcount, entry->error);
173
174 if (entry->error)
175 ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
176 block);
177 return entry;
178}
179
180
181/*
182 * Release cache entry, once usage count is zero it can be reused.
183 */
184void squashfs_cache_put(struct squashfs_cache_entry *entry)
185{
186 struct squashfs_cache *cache = entry->cache;
187
188 spin_lock(&cache->lock);
189 entry->refcount--;
190 if (entry->refcount == 0) {
191 cache->unused++;
192 /*
193 * If there's any processes waiting for a block to become
194 * available, wake one up.
195 */
196 if (cache->num_waiters) {
197 spin_unlock(&cache->lock);
198 wake_up(&cache->wait_queue);
199 return;
200 }
201 }
202 spin_unlock(&cache->lock);
203}
204
205/*
206 * Delete cache reclaiming all kmalloced buffers.
207 */
208void squashfs_cache_delete(struct squashfs_cache *cache)
209{
210 int i, j;
211
212 if (cache == NULL)
213 return;
214
215 for (i = 0; i < cache->entries; i++) {
216 if (cache->entry[i].data) {
217 for (j = 0; j < cache->pages; j++)
218 kfree(cache->entry[i].data[j]);
219 kfree(cache->entry[i].data);
220 }
221 }
222
223 kfree(cache->entry);
224 kfree(cache);
225}
226
227
228/*
229 * Initialise cache allocating the specified number of entries, each of
230 * size block_size. To avoid vmalloc fragmentation issues each entry
231 * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
232 */
233struct squashfs_cache *squashfs_cache_init(char *name, int entries,
234 int block_size)
235{
236 int i, j;
237 struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
238
239 if (cache == NULL) {
240 ERROR("Failed to allocate %s cache\n", name);
241 return NULL;
242 }
243
244 cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
245 if (cache->entry == NULL) {
246 ERROR("Failed to allocate %s cache\n", name);
247 goto cleanup;
248 }
249
250 cache->next_blk = 0;
251 cache->unused = entries;
252 cache->entries = entries;
253 cache->block_size = block_size;
254 cache->pages = block_size >> PAGE_CACHE_SHIFT;
255 cache->name = name;
256 cache->num_waiters = 0;
257 spin_lock_init(&cache->lock);
258 init_waitqueue_head(&cache->wait_queue);
259
260 for (i = 0; i < entries; i++) {
261 struct squashfs_cache_entry *entry = &cache->entry[i];
262
263 init_waitqueue_head(&cache->entry[i].wait_queue);
264 entry->cache = cache;
265 entry->block = SQUASHFS_INVALID_BLK;
266 entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
267 if (entry->data == NULL) {
268 ERROR("Failed to allocate %s cache entry\n", name);
269 goto cleanup;
270 }
271
272 for (j = 0; j < cache->pages; j++) {
273 entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
274 if (entry->data[j] == NULL) {
275 ERROR("Failed to allocate %s buffer\n", name);
276 goto cleanup;
277 }
278 }
279 }
280
281 return cache;
282
283cleanup:
284 squashfs_cache_delete(cache);
285 return NULL;
286}
287
288
289/*
290 * Copy upto length bytes from cache entry to buffer starting at offset bytes
291 * into the cache entry. If there's not length bytes then copy the number of
292 * bytes available. In all cases return the number of bytes copied.
293 */
294int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
295 int offset, int length)
296{
297 int remaining = length;
298
299 if (length == 0)
300 return 0;
301 else if (buffer == NULL)
302 return min(length, entry->length - offset);
303
304 while (offset < entry->length) {
305 void *buff = entry->data[offset / PAGE_CACHE_SIZE]
306 + (offset % PAGE_CACHE_SIZE);
307 int bytes = min_t(int, entry->length - offset,
308 PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
309
310 if (bytes >= remaining) {
311 memcpy(buffer, buff, remaining);
312 remaining = 0;
313 break;
314 }
315
316 memcpy(buffer, buff, bytes);
317 buffer += bytes;
318 remaining -= bytes;
319 offset += bytes;
320 }
321
322 return length - remaining;
323}
324
325
326/*
327 * Read length bytes from metadata position <block, offset> (block is the
328 * start of the compressed block on disk, and offset is the offset into
329 * the block once decompressed). Data is packed into consecutive blocks,
330 * and length bytes may require reading more than one block.
331 */
332int squashfs_read_metadata(struct super_block *sb, void *buffer,
333 u64 *block, int *offset, int length)
334{
335 struct squashfs_sb_info *msblk = sb->s_fs_info;
336 int bytes, copied = length;
337 struct squashfs_cache_entry *entry;
338
339 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
340
341 while (length) {
342 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
343 if (entry->error)
344 return entry->error;
345 else if (*offset >= entry->length)
346 return -EIO;
347
348 bytes = squashfs_copy_data(buffer, entry, *offset, length);
349 if (buffer)
350 buffer += bytes;
351 length -= bytes;
352 *offset += bytes;
353
354 if (*offset == entry->length) {
355 *block = entry->next_index;
356 *offset = 0;
357 }
358
359 squashfs_cache_put(entry);
360 }
361
362 return copied;
363}
364
365
366/*
367 * Look-up in the fragmment cache the fragment located at <start_block> in the
368 * filesystem. If necessary read and decompress it from disk.
369 */
370struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
371 u64 start_block, int length)
372{
373 struct squashfs_sb_info *msblk = sb->s_fs_info;
374
375 return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
376 length);
377}
378
379
380/*
381 * Read and decompress the datablock located at <start_block> in the
382 * filesystem. The cache is used here to avoid duplicating locking and
383 * read/decompress code.
384 */
385struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
386 u64 start_block, int length)
387{
388 struct squashfs_sb_info *msblk = sb->s_fs_info;
389
390 return squashfs_cache_get(sb, msblk->read_page, start_block, length);
391}
392
393
394/*
395 * Read a filesystem table (uncompressed sequence of bytes) from disk
396 */
397int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
398 int length)
399{
400 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
401 int i, res;
402 void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
403 if (data == NULL)
404 return -ENOMEM;
405
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
410 kfree(data);
411 return res;
412}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 000000000000..566b0eaed868
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * dir.c
22 */
23
24/*
25 * This file implements code to read directories from disk.
26 *
27 * See namei.c for a description of directory organisation on disk.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/zlib.h>
34
35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h"
39
40static const unsigned char squashfs_filetype_table[] = {
41 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
42};
43
44/*
45 * Lookup offset (f_pos) in the directory index, returning the
46 * metadata block containing it.
47 *
48 * If we get an error reading the index then return the part of the index
49 * (if any) we have managed to read - the index isn't essential, just
50 * quicker.
51 */
52static int get_dir_index_using_offset(struct super_block *sb,
53 u64 *next_block, int *next_offset, u64 index_start, int index_offset,
54 int i_count, u64 f_pos)
55{
56 struct squashfs_sb_info *msblk = sb->s_fs_info;
57 int err, i, index, length = 0;
58 struct squashfs_dir_index dir_index;
59
60 TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
61 i_count, f_pos);
62
63 /*
64 * Translate from external f_pos to the internal f_pos. This
65 * is offset by 3 because we invent "." and ".." entries which are
66 * not actually stored in the directory.
67 */
68 if (f_pos < 3)
69 return f_pos;
70 f_pos -= 3;
71
72 for (i = 0; i < i_count; i++) {
73 err = squashfs_read_metadata(sb, &dir_index, &index_start,
74 &index_offset, sizeof(dir_index));
75 if (err < 0)
76 break;
77
78 index = le32_to_cpu(dir_index.index);
79 if (index > f_pos)
80 /*
81 * Found the index we're looking for.
82 */
83 break;
84
85 err = squashfs_read_metadata(sb, NULL, &index_start,
86 &index_offset, le32_to_cpu(dir_index.size) + 1);
87 if (err < 0)
88 break;
89
90 length = index;
91 *next_block = le32_to_cpu(dir_index.start_block) +
92 msblk->directory_table;
93 }
94
95 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
96
97 /*
98 * Translate back from internal f_pos to external f_pos.
99 */
100 return length + 3;
101}
102
103
104static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
105{
106 struct inode *inode = file->f_dentry->d_inode;
107 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
108 u64 block = squashfs_i(inode)->start + msblk->directory_table;
109 int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
110 type, err;
111 unsigned int inode_number;
112 struct squashfs_dir_header dirh;
113 struct squashfs_dir_entry *dire;
114
115 TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
116
117 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
118 if (dire == NULL) {
119 ERROR("Failed to allocate squashfs_dir_entry\n");
120 goto finish;
121 }
122
123 /*
124 * Return "." and ".." entries as the first two filenames in the
125 * directory. To maximise compression these two entries are not
126 * stored in the directory, and so we invent them here.
127 *
128 * It also means that the external f_pos is offset by 3 from the
129 * on-disk directory f_pos.
130 */
131 while (file->f_pos < 3) {
132 char *name;
133 int i_ino;
134
135 if (file->f_pos == 0) {
136 name = ".";
137 size = 1;
138 i_ino = inode->i_ino;
139 } else {
140 name = "..";
141 size = 2;
142 i_ino = squashfs_i(inode)->parent;
143 }
144
145 TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
146 dirent, name, size, file->f_pos, i_ino,
147 squashfs_filetype_table[1]);
148
149 if (filldir(dirent, name, size, file->f_pos, i_ino,
150 squashfs_filetype_table[1]) < 0) {
151 TRACE("Filldir returned less than 0\n");
152 goto finish;
153 }
154
155 file->f_pos += size;
156 }
157
158 length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
159 squashfs_i(inode)->dir_idx_start,
160 squashfs_i(inode)->dir_idx_offset,
161 squashfs_i(inode)->dir_idx_cnt,
162 file->f_pos);
163
164 while (length < i_size_read(inode)) {
165 /*
166 * Read directory header
167 */
168 err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
169 &offset, sizeof(dirh));
170 if (err < 0)
171 goto failed_read;
172
173 length += sizeof(dirh);
174
175 dir_count = le32_to_cpu(dirh.count) + 1;
176 while (dir_count--) {
177 /*
178 * Read directory entry.
179 */
180 err = squashfs_read_metadata(inode->i_sb, dire, &block,
181 &offset, sizeof(*dire));
182 if (err < 0)
183 goto failed_read;
184
185 size = le16_to_cpu(dire->size) + 1;
186
187 err = squashfs_read_metadata(inode->i_sb, dire->name,
188 &block, &offset, size);
189 if (err < 0)
190 goto failed_read;
191
192 length += sizeof(*dire) + size;
193
194 if (file->f_pos >= length)
195 continue;
196
197 dire->name[size] = '\0';
198 inode_number = le32_to_cpu(dirh.inode_number) +
199 ((short) le16_to_cpu(dire->inode_number));
200 type = le16_to_cpu(dire->type);
201
202 TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
203 "\n", dirent, dire->name, size,
204 file->f_pos,
205 le32_to_cpu(dirh.start_block),
206 le16_to_cpu(dire->offset),
207 inode_number,
208 squashfs_filetype_table[type]);
209
210 if (filldir(dirent, dire->name, size, file->f_pos,
211 inode_number,
212 squashfs_filetype_table[type]) < 0) {
213 TRACE("Filldir returned less than 0\n");
214 goto finish;
215 }
216
217 file->f_pos = length;
218 }
219 }
220
221finish:
222 kfree(dire);
223 return 0;
224
225failed_read:
226 ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
227 kfree(dire);
228 return 0;
229}
230
231
232const struct file_operations squashfs_dir_ops = {
233 .read = generic_read_dir,
234 .readdir = squashfs_readdir
235};
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 000000000000..69e971d5ddc1
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * export.c
22 */
23
24/*
25 * This file implements code to make Squashfs filesystems exportable (NFS etc.)
26 *
27 * The export code uses an inode lookup table to map inode numbers passed in
28 * filehandles to an inode location on disk. This table is stored compressed
29 * into metadata blocks. A second index table is used to locate these. This
30 * second index table for speed of access (and because it is small) is read at
31 * mount time and cached in memory.
32 *
33 * The inode lookup table is used only by the export code, inode disk
34 * locations are directly encoded in directories, enabling direct access
35 * without an intermediate lookup for all operations except the export ops.
36 */
37
38#include <linux/fs.h>
39#include <linux/vfs.h>
40#include <linux/dcache.h>
41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43
44#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h"
47#include "squashfs.h"
48
49/*
50 * Look-up inode number (ino) in table, returning the inode location.
51 */
52static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
53{
54 struct squashfs_sb_info *msblk = sb->s_fs_info;
55 int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
56 int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
57 u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
58 __le64 ino;
59 int err;
60
61 TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
62
63 err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
64 if (err < 0)
65 return err;
66
67 TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
68 (u64) le64_to_cpu(ino));
69
70 return le64_to_cpu(ino);
71}
72
73
74static struct dentry *squashfs_export_iget(struct super_block *sb,
75 unsigned int ino_num)
76{
77 long long ino;
78 struct dentry *dentry = ERR_PTR(-ENOENT);
79
80 TRACE("Entered squashfs_export_iget\n");
81
82 ino = squashfs_inode_lookup(sb, ino_num);
83 if (ino >= 0)
84 dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num));
85
86 return dentry;
87}
88
89
90static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
91 struct fid *fid, int fh_len, int fh_type)
92{
93 if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT)
94 || fh_len < 2)
95 return NULL;
96
97 return squashfs_export_iget(sb, fid->i32.ino);
98}
99
100
101static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
102 struct fid *fid, int fh_len, int fh_type)
103{
104 if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4)
105 return NULL;
106
107 return squashfs_export_iget(sb, fid->i32.parent_ino);
108}
109
110
111static struct dentry *squashfs_get_parent(struct dentry *child)
112{
113 struct inode *inode = child->d_inode;
114 unsigned int parent_ino = squashfs_i(inode)->parent;
115
116 return squashfs_export_iget(inode->i_sb, parent_ino);
117}
118
119
120/*
121 * Read uncompressed inode lookup table indexes off disk into memory
122 */
123__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
124 u64 lookup_table_start, unsigned int inodes)
125{
126 unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
127 __le64 *inode_lookup_table;
128 int err;
129
130 TRACE("In read_inode_lookup_table, length %d\n", length);
131
132 /* Allocate inode lookup table indexes */
133 inode_lookup_table = kmalloc(length, GFP_KERNEL);
134 if (inode_lookup_table == NULL) {
135 ERROR("Failed to allocate inode lookup table\n");
136 return ERR_PTR(-ENOMEM);
137 }
138
139 err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start,
140 length);
141 if (err < 0) {
142 ERROR("unable to read inode lookup table\n");
143 kfree(inode_lookup_table);
144 return ERR_PTR(err);
145 }
146
147 return inode_lookup_table;
148}
149
150
151const struct export_operations squashfs_export_ops = {
152 .fh_to_dentry = squashfs_fh_to_dentry,
153 .fh_to_parent = squashfs_fh_to_parent,
154 .get_parent = squashfs_get_parent
155};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 000000000000..717767d831df
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * file.c
22 */
23
24/*
25 * This file contains code for handling regular files. A regular file
26 * consists of a sequence of contiguous compressed blocks, and/or a
27 * compressed fragment block (tail-end packed block). The compressed size
28 * of each datablock is stored in a block list contained within the
29 * file inode (itself stored in one or more compressed metadata blocks).
30 *
31 * To speed up access to datablocks when reading 'large' files (256 Mbytes or
32 * larger), the code implements an index cache that caches the mapping from
33 * block index to datablock location on disk.
34 *
35 * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
36 * retaining a simple and space-efficient block list on disk. The cache
37 * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
38 * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
39 * The index cache is designed to be memory efficient, and by default uses
40 * 16 KiB.
41 */
42
43#include <linux/fs.h>
44#include <linux/vfs.h>
45#include <linux/kernel.h>
46#include <linux/slab.h>
47#include <linux/string.h>
48#include <linux/pagemap.h>
49#include <linux/mutex.h>
50#include <linux/zlib.h>
51
52#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h"
54#include "squashfs_fs_i.h"
55#include "squashfs.h"
56
57/*
58 * Locate cache slot in range [offset, index] for specified inode. If
59 * there's more than one return the slot closest to index.
60 */
61static struct meta_index *locate_meta_index(struct inode *inode, int offset,
62 int index)
63{
64 struct meta_index *meta = NULL;
65 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
66 int i;
67
68 mutex_lock(&msblk->meta_index_mutex);
69
70 TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
71
72 if (msblk->meta_index == NULL)
73 goto not_allocated;
74
75 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
76 if (msblk->meta_index[i].inode_number == inode->i_ino &&
77 msblk->meta_index[i].offset >= offset &&
78 msblk->meta_index[i].offset <= index &&
79 msblk->meta_index[i].locked == 0) {
80 TRACE("locate_meta_index: entry %d, offset %d\n", i,
81 msblk->meta_index[i].offset);
82 meta = &msblk->meta_index[i];
83 offset = meta->offset;
84 }
85 }
86
87 if (meta)
88 meta->locked = 1;
89
90not_allocated:
91 mutex_unlock(&msblk->meta_index_mutex);
92
93 return meta;
94}
95
96
97/*
98 * Find and initialise an empty cache slot for index offset.
99 */
100static struct meta_index *empty_meta_index(struct inode *inode, int offset,
101 int skip)
102{
103 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
104 struct meta_index *meta = NULL;
105 int i;
106
107 mutex_lock(&msblk->meta_index_mutex);
108
109 TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
110
111 if (msblk->meta_index == NULL) {
112 /*
113 * First time cache index has been used, allocate and
114 * initialise. The cache index could be allocated at
115 * mount time but doing it here means it is allocated only
116 * if a 'large' file is read.
117 */
118 msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
119 sizeof(*(msblk->meta_index)), GFP_KERNEL);
120 if (msblk->meta_index == NULL) {
121 ERROR("Failed to allocate meta_index\n");
122 goto failed;
123 }
124 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
125 msblk->meta_index[i].inode_number = 0;
126 msblk->meta_index[i].locked = 0;
127 }
128 msblk->next_meta_index = 0;
129 }
130
131 for (i = SQUASHFS_META_SLOTS; i &&
132 msblk->meta_index[msblk->next_meta_index].locked; i--)
133 msblk->next_meta_index = (msblk->next_meta_index + 1) %
134 SQUASHFS_META_SLOTS;
135
136 if (i == 0) {
137 TRACE("empty_meta_index: failed!\n");
138 goto failed;
139 }
140
141 TRACE("empty_meta_index: returned meta entry %d, %p\n",
142 msblk->next_meta_index,
143 &msblk->meta_index[msblk->next_meta_index]);
144
145 meta = &msblk->meta_index[msblk->next_meta_index];
146 msblk->next_meta_index = (msblk->next_meta_index + 1) %
147 SQUASHFS_META_SLOTS;
148
149 meta->inode_number = inode->i_ino;
150 meta->offset = offset;
151 meta->skip = skip;
152 meta->entries = 0;
153 meta->locked = 1;
154
155failed:
156 mutex_unlock(&msblk->meta_index_mutex);
157 return meta;
158}
159
160
161static void release_meta_index(struct inode *inode, struct meta_index *meta)
162{
163 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
164 mutex_lock(&msblk->meta_index_mutex);
165 meta->locked = 0;
166 mutex_unlock(&msblk->meta_index_mutex);
167}
168
169
170/*
171 * Read the next n blocks from the block list, starting from
172 * metadata block <start_block, offset>.
173 */
174static long long read_indexes(struct super_block *sb, int n,
175 u64 *start_block, int *offset)
176{
177 int err, i;
178 long long block = 0;
179 __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
180
181 if (blist == NULL) {
182 ERROR("read_indexes: Failed to allocate block_list\n");
183 return -ENOMEM;
184 }
185
186 while (n) {
187 int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
188
189 err = squashfs_read_metadata(sb, blist, start_block,
190 offset, blocks << 2);
191 if (err < 0) {
192 ERROR("read_indexes: reading block [%llx:%x]\n",
193 *start_block, *offset);
194 goto failure;
195 }
196
197 for (i = 0; i < blocks; i++) {
198 int size = le32_to_cpu(blist[i]);
199 block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
200 }
201 n -= blocks;
202 }
203
204 kfree(blist);
205 return block;
206
207failure:
208 kfree(blist);
209 return err;
210}
211
212
213/*
214 * Each cache index slot has SQUASHFS_META_ENTRIES, each of which
215 * can cache one index -> datablock/blocklist-block mapping. We wish
216 * to distribute these over the length of the file, entry[0] maps index x,
217 * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
218 * The larger the file, the greater the skip factor. The skip factor is
219 * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
220 * the number of metadata blocks that need to be read fits into the cache.
221 * If the skip factor is limited in this way then the file will use multiple
222 * slots.
223 */
224static inline int calculate_skip(int blocks)
225{
226 int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
227 * SQUASHFS_META_INDEXES);
228 return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
229}
230
231
232/*
233 * Search and grow the index cache for the specified inode, returning the
234 * on-disk locations of the datablock and block list metadata block
235 * <index_block, index_offset> for index (scaled to nearest cache index).
236 */
237static int fill_meta_index(struct inode *inode, int index,
238 u64 *index_block, int *index_offset, u64 *data_block)
239{
240 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
241 int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
242 int offset = 0;
243 struct meta_index *meta;
244 struct meta_entry *meta_entry;
245 u64 cur_index_block = squashfs_i(inode)->block_list_start;
246 int cur_offset = squashfs_i(inode)->offset;
247 u64 cur_data_block = squashfs_i(inode)->start;
248 int err, i;
249
250 /*
251 * Scale index to cache index (cache slot entry)
252 */
253 index /= SQUASHFS_META_INDEXES * skip;
254
255 while (offset < index) {
256 meta = locate_meta_index(inode, offset + 1, index);
257
258 if (meta == NULL) {
259 meta = empty_meta_index(inode, offset + 1, skip);
260 if (meta == NULL)
261 goto all_done;
262 } else {
263 offset = index < meta->offset + meta->entries ? index :
264 meta->offset + meta->entries - 1;
265 meta_entry = &meta->meta_entry[offset - meta->offset];
266 cur_index_block = meta_entry->index_block +
267 msblk->inode_table;
268 cur_offset = meta_entry->offset;
269 cur_data_block = meta_entry->data_block;
270 TRACE("get_meta_index: offset %d, meta->offset %d, "
271 "meta->entries %d\n", offset, meta->offset,
272 meta->entries);
273 TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
274 " data_block 0x%llx\n", cur_index_block,
275 cur_offset, cur_data_block);
276 }
277
278 /*
279 * If necessary grow cache slot by reading block list. Cache
280 * slot is extended up to index or to the end of the slot, in
281 * which case further slots will be used.
282 */
283 for (i = meta->offset + meta->entries; i <= index &&
284 i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
285 int blocks = skip * SQUASHFS_META_INDEXES;
286 long long res = read_indexes(inode->i_sb, blocks,
287 &cur_index_block, &cur_offset);
288
289 if (res < 0) {
290 if (meta->entries == 0)
291 /*
292 * Don't leave an empty slot on read
293 * error allocated to this inode...
294 */
295 meta->inode_number = 0;
296 err = res;
297 goto failed;
298 }
299
300 cur_data_block += res;
301 meta_entry = &meta->meta_entry[i - meta->offset];
302 meta_entry->index_block = cur_index_block -
303 msblk->inode_table;
304 meta_entry->offset = cur_offset;
305 meta_entry->data_block = cur_data_block;
306 meta->entries++;
307 offset++;
308 }
309
310 TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
311 meta->offset, meta->entries);
312
313 release_meta_index(inode, meta);
314 }
315
316all_done:
317 *index_block = cur_index_block;
318 *index_offset = cur_offset;
319 *data_block = cur_data_block;
320
321 /*
322 * Scale cache index (cache slot entry) to index
323 */
324 return offset * SQUASHFS_META_INDEXES * skip;
325
326failed:
327 release_meta_index(inode, meta);
328 return err;
329}
330
331
332/*
333 * Get the on-disk location and compressed size of the datablock
334 * specified by index. Fill_meta_index() does most of the work.
335 */
336static int read_blocklist(struct inode *inode, int index, u64 *block)
337{
338 u64 start;
339 long long blks;
340 int offset;
341 __le32 size;
342 int res = fill_meta_index(inode, index, &start, &offset, block);
343
344 TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
345 " 0x%x, block 0x%llx\n", res, index, start, offset,
346 *block);
347
348 if (res < 0)
349 return res;
350
351 /*
352 * res contains the index of the mapping returned by fill_meta_index(),
353 * this will likely be less than the desired index (because the
354 * meta_index cache works at a higher granularity). Read any
355 * extra block indexes needed.
356 */
357 if (res < index) {
358 blks = read_indexes(inode->i_sb, index - res, &start, &offset);
359 if (blks < 0)
360 return (int) blks;
361 *block += blks;
362 }
363
364 /*
365 * Read length of block specified by index.
366 */
367 res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
368 sizeof(size));
369 if (res < 0)
370 return res;
371 return le32_to_cpu(size);
372}
373
374
375static int squashfs_readpage(struct file *file, struct page *page)
376{
377 struct inode *inode = page->mapping->host;
378 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
379 int bytes, i, offset = 0, sparse = 0;
380 struct squashfs_cache_entry *buffer = NULL;
381 void *pageaddr;
382
383 int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
384 int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
385 int start_index = page->index & ~mask;
386 int end_index = start_index | mask;
387 int file_end = i_size_read(inode) >> msblk->block_log;
388
389 TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
390 page->index, squashfs_i(inode)->start);
391
392 if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
393 PAGE_CACHE_SHIFT))
394 goto out;
395
396 if (index < file_end || squashfs_i(inode)->fragment_block ==
397 SQUASHFS_INVALID_BLK) {
398 /*
399 * Reading a datablock from disk. Need to read block list
400 * to get location and block size.
401 */
402 u64 block = 0;
403 int bsize = read_blocklist(inode, index, &block);
404 if (bsize < 0)
405 goto error_out;
406
407 if (bsize == 0) { /* hole */
408 bytes = index == file_end ?
409 (i_size_read(inode) & (msblk->block_size - 1)) :
410 msblk->block_size;
411 sparse = 1;
412 } else {
413 /*
414 * Read and decompress datablock.
415 */
416 buffer = squashfs_get_datablock(inode->i_sb,
417 block, bsize);
418 if (buffer->error) {
419 ERROR("Unable to read page, block %llx, size %x"
420 "\n", block, bsize);
421 squashfs_cache_put(buffer);
422 goto error_out;
423 }
424 bytes = buffer->length;
425 }
426 } else {
427 /*
428 * Datablock is stored inside a fragment (tail-end packed
429 * block).
430 */
431 buffer = squashfs_get_fragment(inode->i_sb,
432 squashfs_i(inode)->fragment_block,
433 squashfs_i(inode)->fragment_size);
434
435 if (buffer->error) {
436 ERROR("Unable to read page, block %llx, size %x\n",
437 squashfs_i(inode)->fragment_block,
438 squashfs_i(inode)->fragment_size);
439 squashfs_cache_put(buffer);
440 goto error_out;
441 }
442 bytes = i_size_read(inode) & (msblk->block_size - 1);
443 offset = squashfs_i(inode)->fragment_offset;
444 }
445
446 /*
447 * Loop copying datablock into pages. As the datablock likely covers
448 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
449 * grab the pages from the page cache, except for the page that we've
450 * been called to fill.
451 */
452 for (i = start_index; i <= end_index && bytes > 0; i++,
453 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
454 struct page *push_page;
455 int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
456
457 TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
458
459 push_page = (i == page->index) ? page :
460 grab_cache_page_nowait(page->mapping, i);
461
462 if (!push_page)
463 continue;
464
465 if (PageUptodate(push_page))
466 goto skip_page;
467
468 pageaddr = kmap_atomic(push_page, KM_USER0);
469 squashfs_copy_data(pageaddr, buffer, offset, avail);
470 memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
471 kunmap_atomic(pageaddr, KM_USER0);
472 flush_dcache_page(push_page);
473 SetPageUptodate(push_page);
474skip_page:
475 unlock_page(push_page);
476 if (i != page->index)
477 page_cache_release(push_page);
478 }
479
480 if (!sparse)
481 squashfs_cache_put(buffer);
482
483 return 0;
484
485error_out:
486 SetPageError(page);
487out:
488 pageaddr = kmap_atomic(page, KM_USER0);
489 memset(pageaddr, 0, PAGE_CACHE_SIZE);
490 kunmap_atomic(pageaddr, KM_USER0);
491 flush_dcache_page(page);
492 if (!PageError(page))
493 SetPageUptodate(page);
494 unlock_page(page);
495
496 return 0;
497}
498
499
500const struct address_space_operations squashfs_aops = {
501 .readpage = squashfs_readpage
502};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 000000000000..b5a2c15bbbc7
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * fragment.c
22 */
23
24/*
25 * This file implements code to handle compressed fragments (tail-end packed
26 * datablocks).
27 *
28 * Regular files contain a fragment index which is mapped to a fragment
29 * location on disk and compressed size using a fragment lookup table.
30 * Like everything in Squashfs this fragment lookup table is itself stored
31 * compressed into metadata blocks. A second index table is used to locate
32 * these. This second index table for speed of access (and because it
33 * is small) is read at mount time and cached in memory.
34 */
35
36#include <linux/fs.h>
37#include <linux/vfs.h>
38#include <linux/slab.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46/*
47 * Look-up fragment using the fragment index table. Return the on disk
48 * location of the fragment and its compressed size
49 */
50int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
51 u64 *fragment_block)
52{
53 struct squashfs_sb_info *msblk = sb->s_fs_info;
54 int block = SQUASHFS_FRAGMENT_INDEX(fragment);
55 int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
56 u64 start_block = le64_to_cpu(msblk->fragment_index[block]);
57 struct squashfs_fragment_entry fragment_entry;
58 int size;
59
60 size = squashfs_read_metadata(sb, &fragment_entry, &start_block,
61 &offset, sizeof(fragment_entry));
62 if (size < 0)
63 return size;
64
65 *fragment_block = le64_to_cpu(fragment_entry.start_block);
66 size = le32_to_cpu(fragment_entry.size);
67
68 return size;
69}
70
71
72/*
73 * Read the uncompressed fragment lookup table indexes off disk into memory
74 */
75__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
76 u64 fragment_table_start, unsigned int fragments)
77{
78 unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
79 __le64 *fragment_index;
80 int err;
81
82 /* Allocate fragment lookup table indexes */
83 fragment_index = kmalloc(length, GFP_KERNEL);
84 if (fragment_index == NULL) {
85 ERROR("Failed to allocate fragment index table\n");
86 return ERR_PTR(-ENOMEM);
87 }
88
89 err = squashfs_read_table(sb, fragment_index, fragment_table_start,
90 length);
91 if (err < 0) {
92 ERROR("unable to read fragment index table\n");
93 kfree(fragment_index);
94 return ERR_PTR(err);
95 }
96
97 return fragment_index;
98}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 000000000000..3795b837ba28
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * id.c
22 */
23
24/*
25 * This file implements code to handle uids and gids.
26 *
27 * For space efficiency regular files store uid and gid indexes, which are
28 * converted to 32-bit uids/gids using an id look up table. This table is
29 * stored compressed into metadata blocks. A second index table is used to
30 * locate these. This second index table for speed of access (and because it
31 * is small) is read at mount time and cached in memory.
32 */
33
34#include <linux/fs.h>
35#include <linux/vfs.h>
36#include <linux/slab.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44/*
45 * Map uid/gid index into real 32-bit uid/gid using the id look up table
46 */
47int squashfs_get_id(struct super_block *sb, unsigned int index,
48 unsigned int *id)
49{
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int block = SQUASHFS_ID_BLOCK(index);
52 int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
53 u64 start_block = le64_to_cpu(msblk->id_table[block]);
54 __le32 disk_id;
55 int err;
56
57 err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
58 sizeof(disk_id));
59 if (err < 0)
60 return err;
61
62 *id = le32_to_cpu(disk_id);
63 return 0;
64}
65
66
67/*
68 * Read uncompressed id lookup table indexes from disk into memory
69 */
70__le64 *squashfs_read_id_index_table(struct super_block *sb,
71 u64 id_table_start, unsigned short no_ids)
72{
73 unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
74 __le64 *id_table;
75 int err;
76
77 TRACE("In read_id_index_table, length %d\n", length);
78
79 /* Allocate id lookup table indexes */
80 id_table = kmalloc(length, GFP_KERNEL);
81 if (id_table == NULL) {
82 ERROR("Failed to allocate id index table\n");
83 return ERR_PTR(-ENOMEM);
84 }
85
86 err = squashfs_read_table(sb, id_table, id_table_start, length);
87 if (err < 0) {
88 ERROR("unable to read id index table\n");
89 kfree(id_table);
90 return ERR_PTR(err);
91 }
92
93 return id_table;
94}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 000000000000..7a63398bb855
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * inode.c
22 */
23
24/*
25 * This file implements code to create and read inodes from disk.
26 *
27 * Inodes in Squashfs are identified by a 48-bit inode which encodes the
28 * location of the compressed metadata block containing the inode, and the byte
29 * offset into that block where the inode is placed (<block, offset>).
30 *
31 * To maximise compression there are different inodes for each file type
32 * (regular file, directory, device, etc.), the inode contents and length
33 * varying with the type.
34 *
35 * To further maximise compression, two types of regular file inode and
36 * directory inode are defined: inodes optimised for frequently occurring
37 * regular files and directories, and extended types where extra
38 * information has to be stored.
39 */
40
41#include <linux/fs.h>
42#include <linux/vfs.h>
43#include <linux/zlib.h>
44
45#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h"
47#include "squashfs_fs_i.h"
48#include "squashfs.h"
49
50/*
51 * Initialise VFS inode with the base inode information common to all
52 * Squashfs inode types. Sqsh_ino contains the unswapped base inode
53 * off disk.
54 */
55static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
56 struct squashfs_base_inode *sqsh_ino)
57{
58 int err;
59
60 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
61 if (err)
62 return err;
63
64 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid);
65 if (err)
66 return err;
67
68 inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
69 inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
70 inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
71 inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
72 inode->i_mode = le16_to_cpu(sqsh_ino->mode);
73 inode->i_size = 0;
74
75 return err;
76}
77
78
79struct inode *squashfs_iget(struct super_block *sb, long long ino,
80 unsigned int ino_number)
81{
82 struct inode *inode = iget_locked(sb, ino_number);
83 int err;
84
85 TRACE("Entered squashfs_iget\n");
86
87 if (!inode)
88 return ERR_PTR(-ENOMEM);
89 if (!(inode->i_state & I_NEW))
90 return inode;
91
92 err = squashfs_read_inode(inode, ino);
93 if (err) {
94 iget_failed(inode);
95 return ERR_PTR(err);
96 }
97
98 unlock_new_inode(inode);
99 return inode;
100}
101
102
103/*
104 * Initialise VFS inode by reading inode from inode table (compressed
105 * metadata). The format and amount of data read depends on type.
106 */
107int squashfs_read_inode(struct inode *inode, long long ino)
108{
109 struct super_block *sb = inode->i_sb;
110 struct squashfs_sb_info *msblk = sb->s_fs_info;
111 u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
112 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
113 union squashfs_inode squashfs_ino;
114 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
115
116 TRACE("Entered squashfs_read_inode\n");
117
118 /*
119 * Read inode base common to all inode types.
120 */
121 err = squashfs_read_metadata(sb, sqshb_ino, &block,
122 &offset, sizeof(*sqshb_ino));
123 if (err < 0)
124 goto failed_read;
125
126 err = squashfs_new_inode(sb, inode, sqshb_ino);
127 if (err)
128 goto failed_read;
129
130 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
131 offset = SQUASHFS_INODE_OFFSET(ino);
132
133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) {
135 case SQUASHFS_REG_TYPE: {
136 unsigned int frag_offset, frag_size, frag;
137 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139
140 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
141 sizeof(*sqsh_ino));
142 if (err < 0)
143 goto failed_read;
144
145 frag = le32_to_cpu(sqsh_ino->fragment);
146 if (frag != SQUASHFS_INVALID_FRAG) {
147 frag_offset = le32_to_cpu(sqsh_ino->offset);
148 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
149 if (frag_size < 0) {
150 err = frag_size;
151 goto failed_read;
152 }
153 } else {
154 frag_blk = SQUASHFS_INVALID_BLK;
155 frag_size = 0;
156 frag_offset = 0;
157 }
158
159 inode->i_nlink = 1;
160 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
161 inode->i_fop = &generic_ro_fops;
162 inode->i_mode |= S_IFREG;
163 inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
164 squashfs_i(inode)->fragment_block = frag_blk;
165 squashfs_i(inode)->fragment_size = frag_size;
166 squashfs_i(inode)->fragment_offset = frag_offset;
167 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
168 squashfs_i(inode)->block_list_start = block;
169 squashfs_i(inode)->offset = offset;
170 inode->i_data.a_ops = &squashfs_aops;
171
172 TRACE("File inode %x:%x, start_block %llx, block_list_start "
173 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
174 offset, squashfs_i(inode)->start, block, offset);
175 break;
176 }
177 case SQUASHFS_LREG_TYPE: {
178 unsigned int frag_offset, frag_size, frag;
179 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181
182 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
183 sizeof(*sqsh_ino));
184 if (err < 0)
185 goto failed_read;
186
187 frag = le32_to_cpu(sqsh_ino->fragment);
188 if (frag != SQUASHFS_INVALID_FRAG) {
189 frag_offset = le32_to_cpu(sqsh_ino->offset);
190 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
191 if (frag_size < 0) {
192 err = frag_size;
193 goto failed_read;
194 }
195 } else {
196 frag_blk = SQUASHFS_INVALID_BLK;
197 frag_size = 0;
198 frag_offset = 0;
199 }
200
201 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
202 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
203 inode->i_fop = &generic_ro_fops;
204 inode->i_mode |= S_IFREG;
205 inode->i_blocks = ((inode->i_size -
206 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
207
208 squashfs_i(inode)->fragment_block = frag_blk;
209 squashfs_i(inode)->fragment_size = frag_size;
210 squashfs_i(inode)->fragment_offset = frag_offset;
211 squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
212 squashfs_i(inode)->block_list_start = block;
213 squashfs_i(inode)->offset = offset;
214 inode->i_data.a_ops = &squashfs_aops;
215
216 TRACE("File inode %x:%x, start_block %llx, block_list_start "
217 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
218 offset, squashfs_i(inode)->start, block, offset);
219 break;
220 }
221 case SQUASHFS_DIR_TYPE: {
222 struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
223
224 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
225 sizeof(*sqsh_ino));
226 if (err < 0)
227 goto failed_read;
228
229 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
230 inode->i_size = le16_to_cpu(sqsh_ino->file_size);
231 inode->i_op = &squashfs_dir_inode_ops;
232 inode->i_fop = &squashfs_dir_ops;
233 inode->i_mode |= S_IFDIR;
234 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
235 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
236 squashfs_i(inode)->dir_idx_cnt = 0;
237 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
238
239 TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
240 SQUASHFS_INODE_BLK(ino), offset,
241 squashfs_i(inode)->start,
242 le16_to_cpu(sqsh_ino->offset));
243 break;
244 }
245 case SQUASHFS_LDIR_TYPE: {
246 struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
247
248 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
249 sizeof(*sqsh_ino));
250 if (err < 0)
251 goto failed_read;
252
253 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
254 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
255 inode->i_op = &squashfs_dir_inode_ops;
256 inode->i_fop = &squashfs_dir_ops;
257 inode->i_mode |= S_IFDIR;
258 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
259 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
260 squashfs_i(inode)->dir_idx_start = block;
261 squashfs_i(inode)->dir_idx_offset = offset;
262 squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
263 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
264
265 TRACE("Long directory inode %x:%x, start_block %llx, offset "
266 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
267 squashfs_i(inode)->start,
268 le16_to_cpu(sqsh_ino->offset));
269 break;
270 }
271 case SQUASHFS_SYMLINK_TYPE:
272 case SQUASHFS_LSYMLINK_TYPE: {
273 struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
274
275 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
276 sizeof(*sqsh_ino));
277 if (err < 0)
278 goto failed_read;
279
280 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
281 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
282 inode->i_op = &page_symlink_inode_operations;
283 inode->i_data.a_ops = &squashfs_symlink_aops;
284 inode->i_mode |= S_IFLNK;
285 squashfs_i(inode)->start = block;
286 squashfs_i(inode)->offset = offset;
287
288 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
289 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
290 block, offset);
291 break;
292 }
293 case SQUASHFS_BLKDEV_TYPE:
294 case SQUASHFS_CHRDEV_TYPE:
295 case SQUASHFS_LBLKDEV_TYPE:
296 case SQUASHFS_LCHRDEV_TYPE: {
297 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
298 unsigned int rdev;
299
300 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
301 sizeof(*sqsh_ino));
302 if (err < 0)
303 goto failed_read;
304
305 if (type == SQUASHFS_CHRDEV_TYPE)
306 inode->i_mode |= S_IFCHR;
307 else
308 inode->i_mode |= S_IFBLK;
309 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
310 rdev = le32_to_cpu(sqsh_ino->rdev);
311 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
312
313 TRACE("Device inode %x:%x, rdev %x\n",
314 SQUASHFS_INODE_BLK(ino), offset, rdev);
315 break;
316 }
317 case SQUASHFS_FIFO_TYPE:
318 case SQUASHFS_SOCKET_TYPE:
319 case SQUASHFS_LFIFO_TYPE:
320 case SQUASHFS_LSOCKET_TYPE: {
321 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
322
323 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
324 sizeof(*sqsh_ino));
325 if (err < 0)
326 goto failed_read;
327
328 if (type == SQUASHFS_FIFO_TYPE)
329 inode->i_mode |= S_IFIFO;
330 else
331 inode->i_mode |= S_IFSOCK;
332 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
333 init_special_inode(inode, inode->i_mode, 0);
334 break;
335 }
336 default:
337 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
338 return -EINVAL;
339 }
340
341 return 0;
342
343failed_read:
344 ERROR("Unable to read inode 0x%llx\n", ino);
345 return err;
346}
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 000000000000..9e398653b22b
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * namei.c
22 */
23
24/*
25 * This file implements code to do filename lookup in directories.
26 *
27 * Like inodes, directories are packed into compressed metadata blocks, stored
28 * in a directory table. Directories are accessed using the start address of
29 * the metablock containing the directory and the offset into the
30 * decompressed block (<block, offset>).
31 *
32 * Directories are organised in a slightly complex way, and are not simply
33 * a list of file names. The organisation takes advantage of the
34 * fact that (in most cases) the inodes of the files will be in the same
35 * compressed metadata block, and therefore, can share the start block.
36 * Directories are therefore organised in a two level list, a directory
37 * header containing the shared start block value, and a sequence of directory
38 * entries, each of which share the shared start block. A new directory header
39 * is written once/if the inode start block changes. The directory
40 * header/directory entry list is repeated as many times as necessary.
41 *
42 * Directories are sorted, and can contain a directory index to speed up
43 * file lookup. Directory indexes store one entry per metablock, each entry
44 * storing the index/filename mapping to the first directory header
45 * in each metadata block. Directories are sorted in alphabetical order,
46 * and at lookup the index is scanned linearly looking for the first filename
47 * alphabetically larger than the filename being looked up. At this point the
48 * location of the metadata block the filename is in has been found.
49 * The general idea of the index is ensure only one metadata block needs to be
50 * decompressed to do a lookup irrespective of the length of the directory.
51 * This scheme has the advantage that it doesn't require extra memory overhead
52 * and doesn't require much extra storage on disk.
53 */
54
55#include <linux/fs.h>
56#include <linux/vfs.h>
57#include <linux/slab.h>
58#include <linux/string.h>
59#include <linux/dcache.h>
60#include <linux/zlib.h>
61
62#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h"
64#include "squashfs_fs_i.h"
65#include "squashfs.h"
66
67/*
68 * Lookup name in the directory index, returning the location of the metadata
69 * block containing it, and the directory index this represents.
70 *
71 * If we get an error reading the index then return the part of the index
72 * (if any) we have managed to read - the index isn't essential, just
73 * quicker.
74 */
75static int get_dir_index_using_name(struct super_block *sb,
76 u64 *next_block, int *next_offset, u64 index_start,
77 int index_offset, int i_count, const char *name,
78 int len)
79{
80 struct squashfs_sb_info *msblk = sb->s_fs_info;
81 int i, size, length = 0, err;
82 struct squashfs_dir_index *index;
83 char *str;
84
85 TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
86
87 index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
88 if (index == NULL) {
89 ERROR("Failed to allocate squashfs_dir_index\n");
90 goto out;
91 }
92
93 str = &index->name[SQUASHFS_NAME_LEN + 1];
94 strncpy(str, name, len);
95 str[len] = '\0';
96
97 for (i = 0; i < i_count; i++) {
98 err = squashfs_read_metadata(sb, index, &index_start,
99 &index_offset, sizeof(*index));
100 if (err < 0)
101 break;
102
103
104 size = le32_to_cpu(index->size) + 1;
105
106 err = squashfs_read_metadata(sb, index->name, &index_start,
107 &index_offset, size);
108 if (err < 0)
109 break;
110
111 index->name[size] = '\0';
112
113 if (strcmp(index->name, str) > 0)
114 break;
115
116 length = le32_to_cpu(index->index);
117 *next_block = le32_to_cpu(index->start_block) +
118 msblk->directory_table;
119 }
120
121 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
122 kfree(index);
123
124out:
125 /*
126 * Return index (f_pos) of the looked up metadata block. Translate
127 * from internal f_pos to external f_pos which is offset by 3 because
128 * we invent "." and ".." entries which are not actually stored in the
129 * directory.
130 */
131 return length + 3;
132}
133
134
135static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
136 struct nameidata *nd)
137{
138 const unsigned char *name = dentry->d_name.name;
139 int len = dentry->d_name.len;
140 struct inode *inode = NULL;
141 struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
142 struct squashfs_dir_header dirh;
143 struct squashfs_dir_entry *dire;
144 u64 block = squashfs_i(dir)->start + msblk->directory_table;
145 int offset = squashfs_i(dir)->offset;
146 int err, length = 0, dir_count, size;
147
148 TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
149
150 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
151 if (dire == NULL) {
152 ERROR("Failed to allocate squashfs_dir_entry\n");
153 return ERR_PTR(-ENOMEM);
154 }
155
156 if (len > SQUASHFS_NAME_LEN) {
157 err = -ENAMETOOLONG;
158 goto failed;
159 }
160
161 length = get_dir_index_using_name(dir->i_sb, &block, &offset,
162 squashfs_i(dir)->dir_idx_start,
163 squashfs_i(dir)->dir_idx_offset,
164 squashfs_i(dir)->dir_idx_cnt, name, len);
165
166 while (length < i_size_read(dir)) {
167 /*
168 * Read directory header.
169 */
170 err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
171 &offset, sizeof(dirh));
172 if (err < 0)
173 goto read_failure;
174
175 length += sizeof(dirh);
176
177 dir_count = le32_to_cpu(dirh.count) + 1;
178 while (dir_count--) {
179 /*
180 * Read directory entry.
181 */
182 err = squashfs_read_metadata(dir->i_sb, dire, &block,
183 &offset, sizeof(*dire));
184 if (err < 0)
185 goto read_failure;
186
187 size = le16_to_cpu(dire->size) + 1;
188
189 err = squashfs_read_metadata(dir->i_sb, dire->name,
190 &block, &offset, size);
191 if (err < 0)
192 goto read_failure;
193
194 length += sizeof(*dire) + size;
195
196 if (name[0] < dire->name[0])
197 goto exit_lookup;
198
199 if (len == size && !strncmp(name, dire->name, len)) {
200 unsigned int blk, off, ino_num;
201 long long ino;
202 blk = le32_to_cpu(dirh.start_block);
203 off = le16_to_cpu(dire->offset);
204 ino_num = le32_to_cpu(dirh.inode_number) +
205 (short) le16_to_cpu(dire->inode_number);
206 ino = SQUASHFS_MKINODE(blk, off);
207
208 TRACE("calling squashfs_iget for directory "
209 "entry %s, inode %x:%x, %d\n", name,
210 blk, off, ino_num);
211
212 inode = squashfs_iget(dir->i_sb, ino, ino_num);
213 if (IS_ERR(inode)) {
214 err = PTR_ERR(inode);
215 goto failed;
216 }
217
218 goto exit_lookup;
219 }
220 }
221 }
222
223exit_lookup:
224 kfree(dire);
225 if (inode)
226 return d_splice_alias(inode, dentry);
227 d_add(dentry, inode);
228 return ERR_PTR(0);
229
230read_failure:
231 ERROR("Unable to read directory block [%llx:%x]\n",
232 squashfs_i(dir)->start + msblk->directory_table,
233 squashfs_i(dir)->offset);
234failed:
235 kfree(dire);
236 return ERR_PTR(err);
237}
238
239
240const struct inode_operations squashfs_dir_inode_ops = {
241 .lookup = squashfs_lookup
242};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 000000000000..6b2515d027d5
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * squashfs.h
22 */
23
24#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args)
25
26#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args)
27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int);
38
39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
41extern void squashfs_cache_delete(struct squashfs_cache *);
42extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
43 struct squashfs_cache *, u64, int);
44extern void squashfs_cache_put(struct squashfs_cache_entry *);
45extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
46extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
47 int *, int);
48extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
49 u64, int);
50extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53
54/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int);
57
58/* fragment.c */
59extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
60extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
61 u64, unsigned int);
62
63/* id.c */
64extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
65extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
66 unsigned short);
67
68/* inode.c */
69extern struct inode *squashfs_iget(struct super_block *, long long,
70 unsigned int);
71extern int squashfs_read_inode(struct inode *, long long);
72
73/*
74 * Inodes and files operations
75 */
76
77/* dir.c */
78extern const struct file_operations squashfs_dir_ops;
79
80/* export.c */
81extern const struct export_operations squashfs_export_ops;
82
83/* file.c */
84extern const struct address_space_operations squashfs_aops;
85
86/* namei.c */
87extern const struct inode_operations squashfs_dir_inode_ops;
88
89/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 000000000000..6840da1bf21e
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,381 @@
1#ifndef SQUASHFS_FS
2#define SQUASHFS_FS
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs.h
24 */
25
26#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
27#define SQUASHFS_MAJOR 4
28#define SQUASHFS_MINOR 0
29#define SQUASHFS_MAGIC 0x73717368
30#define SQUASHFS_START 0
31
32/* size of metadata (inode and directory) blocks */
33#define SQUASHFS_METADATA_SIZE 8192
34#define SQUASHFS_METADATA_LOG 13
35
36/* default size of data blocks */
37#define SQUASHFS_FILE_SIZE 131072
38#define SQUASHFS_FILE_LOG 17
39
40#define SQUASHFS_FILE_MAX_SIZE 1048576
41#define SQUASHFS_FILE_MAX_LOG 20
42
43/* Max number of uids and gids */
44#define SQUASHFS_IDS 65536
45
46/* Max length of filename (not 255) */
47#define SQUASHFS_NAME_LEN 256
48
49#define SQUASHFS_INVALID_FRAG (0xffffffffU)
50#define SQUASHFS_INVALID_BLK (-1LL)
51
52/* Filesystem flags */
53#define SQUASHFS_NOI 0
54#define SQUASHFS_NOD 1
55#define SQUASHFS_NOF 3
56#define SQUASHFS_NO_FRAG 4
57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7
60
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62
63#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \
64 SQUASHFS_NOI)
65
66#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \
67 SQUASHFS_NOD)
68
69#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
70 SQUASHFS_NOF)
71
72#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
73 SQUASHFS_NO_FRAG)
74
75#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
76 SQUASHFS_ALWAYS_FRAG)
77
78#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \
79 SQUASHFS_DUPLICATE)
80
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT)
83
84/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2
87#define SQUASHFS_SYMLINK_TYPE 3
88#define SQUASHFS_BLKDEV_TYPE 4
89#define SQUASHFS_CHRDEV_TYPE 5
90#define SQUASHFS_FIFO_TYPE 6
91#define SQUASHFS_SOCKET_TYPE 7
92#define SQUASHFS_LDIR_TYPE 8
93#define SQUASHFS_LREG_TYPE 9
94#define SQUASHFS_LSYMLINK_TYPE 10
95#define SQUASHFS_LBLKDEV_TYPE 11
96#define SQUASHFS_LCHRDEV_TYPE 12
97#define SQUASHFS_LFIFO_TYPE 13
98#define SQUASHFS_LSOCKET_TYPE 14
99
100/* Flag whether block is compressed or uncompressed, bit is set if block is
101 * uncompressed */
102#define SQUASHFS_COMPRESSED_BIT (1 << 15)
103
104#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
105 (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT)
106
107#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT))
108
109#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24)
110
111#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \
112 ~SQUASHFS_COMPRESSED_BIT_BLOCK)
113
114#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
115
116/*
117 * Inode number ops. Inodes consist of a compressed block number, and an
118 * uncompressed offset within that block
119 */
120#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16))
121
122#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff))
123
124#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\
125 << 16) + (B)))
126
127/* Translate between VFS mode and squashfs mode */
128#define SQUASHFS_MODE(A) ((A) & 0xfff)
129
130/* fragment and fragment table defines */
131#define SQUASHFS_FRAGMENT_BYTES(A) \
132 ((A) * sizeof(struct squashfs_fragment_entry))
133
134#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \
135 SQUASHFS_METADATA_SIZE)
136
137#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \
138 SQUASHFS_METADATA_SIZE)
139
140#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \
141 SQUASHFS_METADATA_SIZE - 1) / \
142 SQUASHFS_METADATA_SIZE)
143
144#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\
145 sizeof(u64))
146
147/* inode lookup table defines */
148#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64))
149
150#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \
151 SQUASHFS_METADATA_SIZE)
152
153#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \
154 SQUASHFS_METADATA_SIZE)
155
156#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \
157 SQUASHFS_METADATA_SIZE - 1) / \
158 SQUASHFS_METADATA_SIZE)
159
160#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\
161 sizeof(u64))
162
163/* uid/gid lookup table defines */
164#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int))
165
166#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \
167 SQUASHFS_METADATA_SIZE)
168
169#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \
170 SQUASHFS_METADATA_SIZE)
171
172#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \
173 SQUASHFS_METADATA_SIZE - 1) / \
174 SQUASHFS_METADATA_SIZE)
175
176#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
177 sizeof(u64))
178
179/* cached data constants for filesystem */
180#define SQUASHFS_CACHED_BLKS 8
181
182#define SQUASHFS_MAX_FILE_SIZE_LOG 64
183
184#define SQUASHFS_MAX_FILE_SIZE (1LL << \
185 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
186
187#define SQUASHFS_MARKER_BYTE 0xff
188
189/* meta index cache */
190#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
191#define SQUASHFS_META_ENTRIES 127
192#define SQUASHFS_META_SLOTS 8
193
194struct meta_entry {
195 u64 data_block;
196 unsigned int index_block;
197 unsigned short offset;
198 unsigned short pad;
199};
200
201struct meta_index {
202 unsigned int inode_number;
203 unsigned int offset;
204 unsigned short entries;
205 unsigned short skip;
206 unsigned short locked;
207 unsigned short pad;
208 struct meta_entry meta_entry[SQUASHFS_META_ENTRIES];
209};
210
211
212/*
213 * definitions for structures on disk
214 */
215#define ZLIB_COMPRESSION 1
216
217struct squashfs_super_block {
218 __le32 s_magic;
219 __le32 inodes;
220 __le32 mkfs_time;
221 __le32 block_size;
222 __le32 fragments;
223 __le16 compression;
224 __le16 block_log;
225 __le16 flags;
226 __le16 no_ids;
227 __le16 s_major;
228 __le16 s_minor;
229 __le64 root_inode;
230 __le64 bytes_used;
231 __le64 id_table_start;
232 __le64 xattr_table_start;
233 __le64 inode_table_start;
234 __le64 directory_table_start;
235 __le64 fragment_table_start;
236 __le64 lookup_table_start;
237};
238
239struct squashfs_dir_index {
240 __le32 index;
241 __le32 start_block;
242 __le32 size;
243 unsigned char name[0];
244};
245
246struct squashfs_base_inode {
247 __le16 inode_type;
248 __le16 mode;
249 __le16 uid;
250 __le16 guid;
251 __le32 mtime;
252 __le32 inode_number;
253};
254
255struct squashfs_ipc_inode {
256 __le16 inode_type;
257 __le16 mode;
258 __le16 uid;
259 __le16 guid;
260 __le32 mtime;
261 __le32 inode_number;
262 __le32 nlink;
263};
264
265struct squashfs_dev_inode {
266 __le16 inode_type;
267 __le16 mode;
268 __le16 uid;
269 __le16 guid;
270 __le32 mtime;
271 __le32 inode_number;
272 __le32 nlink;
273 __le32 rdev;
274};
275
276struct squashfs_symlink_inode {
277 __le16 inode_type;
278 __le16 mode;
279 __le16 uid;
280 __le16 guid;
281 __le32 mtime;
282 __le32 inode_number;
283 __le32 nlink;
284 __le32 symlink_size;
285 char symlink[0];
286};
287
288struct squashfs_reg_inode {
289 __le16 inode_type;
290 __le16 mode;
291 __le16 uid;
292 __le16 guid;
293 __le32 mtime;
294 __le32 inode_number;
295 __le32 start_block;
296 __le32 fragment;
297 __le32 offset;
298 __le32 file_size;
299 __le16 block_list[0];
300};
301
302struct squashfs_lreg_inode {
303 __le16 inode_type;
304 __le16 mode;
305 __le16 uid;
306 __le16 guid;
307 __le32 mtime;
308 __le32 inode_number;
309 __le64 start_block;
310 __le64 file_size;
311 __le64 sparse;
312 __le32 nlink;
313 __le32 fragment;
314 __le32 offset;
315 __le32 xattr;
316 __le16 block_list[0];
317};
318
319struct squashfs_dir_inode {
320 __le16 inode_type;
321 __le16 mode;
322 __le16 uid;
323 __le16 guid;
324 __le32 mtime;
325 __le32 inode_number;
326 __le32 start_block;
327 __le32 nlink;
328 __le16 file_size;
329 __le16 offset;
330 __le32 parent_inode;
331};
332
333struct squashfs_ldir_inode {
334 __le16 inode_type;
335 __le16 mode;
336 __le16 uid;
337 __le16 guid;
338 __le32 mtime;
339 __le32 inode_number;
340 __le32 nlink;
341 __le32 file_size;
342 __le32 start_block;
343 __le32 parent_inode;
344 __le16 i_count;
345 __le16 offset;
346 __le32 xattr;
347 struct squashfs_dir_index index[0];
348};
349
350union squashfs_inode {
351 struct squashfs_base_inode base;
352 struct squashfs_dev_inode dev;
353 struct squashfs_symlink_inode symlink;
354 struct squashfs_reg_inode reg;
355 struct squashfs_lreg_inode lreg;
356 struct squashfs_dir_inode dir;
357 struct squashfs_ldir_inode ldir;
358 struct squashfs_ipc_inode ipc;
359};
360
361struct squashfs_dir_entry {
362 __le16 offset;
363 __le16 inode_number;
364 __le16 type;
365 __le16 size;
366 char name[0];
367};
368
369struct squashfs_dir_header {
370 __le32 count;
371 __le32 start_block;
372 __le32 inode_number;
373};
374
375struct squashfs_fragment_entry {
376 __le64 start_block;
377 __le32 size;
378 unsigned int unused;
379};
380
381#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 000000000000..fbfca30c0c68
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
1#ifndef SQUASHFS_FS_I
2#define SQUASHFS_FS_I
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_i.h
24 */
25
26struct squashfs_inode_info {
27 u64 start;
28 int offset;
29 union {
30 struct {
31 u64 fragment_block;
32 int fragment_size;
33 int fragment_offset;
34 u64 block_list_start;
35 };
36 struct {
37 u64 dir_idx_start;
38 int dir_idx_offset;
39 int dir_idx_cnt;
40 int parent;
41 };
42 };
43 struct inode vfs_inode;
44};
45#endif
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 000000000000..c8c65614dd1c
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
1#ifndef SQUASHFS_FS_SB
2#define SQUASHFS_FS_SB
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_sb.h
24 */
25
26#include "squashfs_fs.h"
27
28struct squashfs_cache {
29 char *name;
30 int entries;
31 int next_blk;
32 int num_waiters;
33 int unused;
34 int block_size;
35 int pages;
36 spinlock_t lock;
37 wait_queue_head_t wait_queue;
38 struct squashfs_cache_entry *entry;
39};
40
41struct squashfs_cache_entry {
42 u64 block;
43 int length;
44 int refcount;
45 u64 next_index;
46 int pending;
47 int error;
48 int num_waiters;
49 wait_queue_head_t wait_queue;
50 struct squashfs_cache *cache;
51 void **data;
52};
53
54struct squashfs_sb_info {
55 int devblksize;
56 int devblksize_log2;
57 struct squashfs_cache *block_cache;
58 struct squashfs_cache *fragment_cache;
59 struct squashfs_cache *read_page;
60 int next_meta_index;
61 __le64 *id_table;
62 __le64 *fragment_index;
63 unsigned int *fragment_index_2;
64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index;
67 z_stream stream;
68 __le64 *inode_lookup_table;
69 u64 inode_table;
70 u64 directory_table;
71 unsigned int block_size;
72 unsigned short block_log;
73 long long bytes_used;
74 unsigned int inodes;
75};
76#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 000000000000..a0466d7467b2
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,440 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * super.c
22 */
23
24/*
25 * This file implements code to read the superblock, read and initialise
26 * in-memory structures at mount time, and all the VFS glue code to register
27 * the filesystem.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/mutex.h>
34#include <linux/pagemap.h>
35#include <linux/init.h>
36#include <linux/module.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44static struct file_system_type squashfs_fs_type;
45static struct super_operations squashfs_super_ops;
46
47static int supported_squashfs_filesystem(short major, short minor, short comp)
48{
49 if (major < SQUASHFS_MAJOR) {
50 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
51 "filesystems are unsupported\n", major, minor);
52 return -EINVAL;
53 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
54 ERROR("Major/Minor mismatch, trying to mount newer "
55 "%d.%d filesystem\n", major, minor);
56 ERROR("Please update your kernel\n");
57 return -EINVAL;
58 }
59
60 if (comp != ZLIB_COMPRESSION)
61 return -EINVAL;
62
63 return 0;
64}
65
66
67static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
68{
69 struct squashfs_sb_info *msblk;
70 struct squashfs_super_block *sblk = NULL;
71 char b[BDEVNAME_SIZE];
72 struct inode *root;
73 long long root_inode;
74 unsigned short flags;
75 unsigned int fragments;
76 u64 lookup_table_start;
77 int err;
78
79 TRACE("Entered squashfs_fill_superblock\n");
80
81 sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
82 if (sb->s_fs_info == NULL) {
83 ERROR("Failed to allocate squashfs_sb_info\n");
84 return -ENOMEM;
85 }
86 msblk = sb->s_fs_info;
87
88 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
89 GFP_KERNEL);
90 if (msblk->stream.workspace == NULL) {
91 ERROR("Failed to allocate zlib workspace\n");
92 goto failure;
93 }
94
95 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
96 if (sblk == NULL) {
97 ERROR("Failed to allocate squashfs_super_block\n");
98 goto failure;
99 }
100
101 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
102 msblk->devblksize_log2 = ffz(~msblk->devblksize);
103
104 mutex_init(&msblk->read_data_mutex);
105 mutex_init(&msblk->meta_index_mutex);
106
107 /*
108 * msblk->bytes_used is checked in squashfs_read_table to ensure reads
109 * are not beyond filesystem end. But as we're using
110 * squashfs_read_table here to read the superblock (including the value
111 * of bytes_used) we need to set it to an initial sensible dummy value
112 */
113 msblk->bytes_used = sizeof(*sblk);
114 err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
115
116 if (err < 0) {
117 ERROR("unable to read squashfs_super_block\n");
118 goto failed_mount;
119 }
120
121 /* Check it is a SQUASHFS superblock */
122 sb->s_magic = le32_to_cpu(sblk->s_magic);
123 if (sb->s_magic != SQUASHFS_MAGIC) {
124 if (!silent)
125 ERROR("Can't find a SQUASHFS superblock on %s\n",
126 bdevname(sb->s_bdev, b));
127 err = -EINVAL;
128 goto failed_mount;
129 }
130
131 /* Check the MAJOR & MINOR versions and compression type */
132 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
133 le16_to_cpu(sblk->s_minor),
134 le16_to_cpu(sblk->compression));
135 if (err < 0)
136 goto failed_mount;
137
138 err = -EINVAL;
139
140 /*
141 * Check if there's xattrs in the filesystem. These are not
142 * supported in this version, so warn that they will be ignored.
143 */
144 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
145 ERROR("Xattrs in filesystem, these will be ignored\n");
146
147 /* Check the filesystem does not extend beyond the end of the
148 block device */
149 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
150 if (msblk->bytes_used < 0 || msblk->bytes_used >
151 i_size_read(sb->s_bdev->bd_inode))
152 goto failed_mount;
153
154 /* Check block size for sanity */
155 msblk->block_size = le32_to_cpu(sblk->block_size);
156 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
157 goto failed_mount;
158
159 msblk->block_log = le16_to_cpu(sblk->block_log);
160 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
161 goto failed_mount;
162
163 /* Check the root inode for sanity */
164 root_inode = le64_to_cpu(sblk->root_inode);
165 if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
166 goto failed_mount;
167
168 msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
169 msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
170 msblk->inodes = le32_to_cpu(sblk->inodes);
171 flags = le16_to_cpu(sblk->flags);
172
173 TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
174 TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
175 ? "un" : "");
176 TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
177 ? "un" : "");
178 TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
179 TRACE("Block size %d\n", msblk->block_size);
180 TRACE("Number of inodes %d\n", msblk->inodes);
181 TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
182 TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
183 TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
184 TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
185 TRACE("sblk->fragment_table_start %llx\n",
186 (u64) le64_to_cpu(sblk->fragment_table_start));
187 TRACE("sblk->id_table_start %llx\n",
188 (u64) le64_to_cpu(sblk->id_table_start));
189
190 sb->s_maxbytes = MAX_LFS_FILESIZE;
191 sb->s_flags |= MS_RDONLY;
192 sb->s_op = &squashfs_super_ops;
193
194 err = -ENOMEM;
195
196 msblk->block_cache = squashfs_cache_init("metadata",
197 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
198 if (msblk->block_cache == NULL)
199 goto failed_mount;
200
201 /* Allocate read_page block */
202 msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
203 if (msblk->read_page == NULL) {
204 ERROR("Failed to allocate read_page block\n");
205 goto failed_mount;
206 }
207
208 /* Allocate and read id index table */
209 msblk->id_table = squashfs_read_id_index_table(sb,
210 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
211 if (IS_ERR(msblk->id_table)) {
212 err = PTR_ERR(msblk->id_table);
213 msblk->id_table = NULL;
214 goto failed_mount;
215 }
216
217 fragments = le32_to_cpu(sblk->fragments);
218 if (fragments == 0)
219 goto allocate_lookup_table;
220
221 msblk->fragment_cache = squashfs_cache_init("fragment",
222 SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
223 if (msblk->fragment_cache == NULL) {
224 err = -ENOMEM;
225 goto failed_mount;
226 }
227
228 /* Allocate and read fragment index table */
229 msblk->fragment_index = squashfs_read_fragment_index_table(sb,
230 le64_to_cpu(sblk->fragment_table_start), fragments);
231 if (IS_ERR(msblk->fragment_index)) {
232 err = PTR_ERR(msblk->fragment_index);
233 msblk->fragment_index = NULL;
234 goto failed_mount;
235 }
236
237allocate_lookup_table:
238 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
239 if (lookup_table_start == SQUASHFS_INVALID_BLK)
240 goto allocate_root;
241
242 /* Allocate and read inode lookup table */
243 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
244 lookup_table_start, msblk->inodes);
245 if (IS_ERR(msblk->inode_lookup_table)) {
246 err = PTR_ERR(msblk->inode_lookup_table);
247 msblk->inode_lookup_table = NULL;
248 goto failed_mount;
249 }
250
251 sb->s_export_op = &squashfs_export_ops;
252
253allocate_root:
254 root = new_inode(sb);
255 if (!root) {
256 err = -ENOMEM;
257 goto failed_mount;
258 }
259
260 err = squashfs_read_inode(root, root_inode);
261 if (err) {
262 iget_failed(root);
263 goto failed_mount;
264 }
265 insert_inode_hash(root);
266
267 sb->s_root = d_alloc_root(root);
268 if (sb->s_root == NULL) {
269 ERROR("Root inode create failed\n");
270 err = -ENOMEM;
271 iput(root);
272 goto failed_mount;
273 }
274
275 TRACE("Leaving squashfs_fill_super\n");
276 kfree(sblk);
277 return 0;
278
279failed_mount:
280 squashfs_cache_delete(msblk->block_cache);
281 squashfs_cache_delete(msblk->fragment_cache);
282 squashfs_cache_delete(msblk->read_page);
283 kfree(msblk->inode_lookup_table);
284 kfree(msblk->fragment_index);
285 kfree(msblk->id_table);
286 kfree(msblk->stream.workspace);
287 kfree(sb->s_fs_info);
288 sb->s_fs_info = NULL;
289 kfree(sblk);
290 return err;
291
292failure:
293 kfree(msblk->stream.workspace);
294 kfree(sb->s_fs_info);
295 sb->s_fs_info = NULL;
296 return -ENOMEM;
297}
298
299
300static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
301{
302 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
303
304 TRACE("Entered squashfs_statfs\n");
305
306 buf->f_type = SQUASHFS_MAGIC;
307 buf->f_bsize = msblk->block_size;
308 buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1;
309 buf->f_bfree = buf->f_bavail = 0;
310 buf->f_files = msblk->inodes;
311 buf->f_ffree = 0;
312 buf->f_namelen = SQUASHFS_NAME_LEN;
313
314 return 0;
315}
316
317
318static int squashfs_remount(struct super_block *sb, int *flags, char *data)
319{
320 *flags |= MS_RDONLY;
321 return 0;
322}
323
324
325static void squashfs_put_super(struct super_block *sb)
326{
327 if (sb->s_fs_info) {
328 struct squashfs_sb_info *sbi = sb->s_fs_info;
329 squashfs_cache_delete(sbi->block_cache);
330 squashfs_cache_delete(sbi->fragment_cache);
331 squashfs_cache_delete(sbi->read_page);
332 kfree(sbi->id_table);
333 kfree(sbi->fragment_index);
334 kfree(sbi->meta_index);
335 kfree(sbi->stream.workspace);
336 kfree(sb->s_fs_info);
337 sb->s_fs_info = NULL;
338 }
339}
340
341
342static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
343 const char *dev_name, void *data,
344 struct vfsmount *mnt)
345{
346 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
347 mnt);
348}
349
350
351static struct kmem_cache *squashfs_inode_cachep;
352
353
354static void init_once(void *foo)
355{
356 struct squashfs_inode_info *ei = foo;
357
358 inode_init_once(&ei->vfs_inode);
359}
360
361
362static int __init init_inodecache(void)
363{
364 squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
365 sizeof(struct squashfs_inode_info), 0,
366 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
367
368 return squashfs_inode_cachep ? 0 : -ENOMEM;
369}
370
371
372static void destroy_inodecache(void)
373{
374 kmem_cache_destroy(squashfs_inode_cachep);
375}
376
377
378static int __init init_squashfs_fs(void)
379{
380 int err = init_inodecache();
381
382 if (err)
383 return err;
384
385 err = register_filesystem(&squashfs_fs_type);
386 if (err) {
387 destroy_inodecache();
388 return err;
389 }
390
391 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
392 "Phillip Lougher\n");
393
394 return 0;
395}
396
397
398static void __exit exit_squashfs_fs(void)
399{
400 unregister_filesystem(&squashfs_fs_type);
401 destroy_inodecache();
402}
403
404
405static struct inode *squashfs_alloc_inode(struct super_block *sb)
406{
407 struct squashfs_inode_info *ei =
408 kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
409
410 return ei ? &ei->vfs_inode : NULL;
411}
412
413
414static void squashfs_destroy_inode(struct inode *inode)
415{
416 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
417}
418
419
420static struct file_system_type squashfs_fs_type = {
421 .owner = THIS_MODULE,
422 .name = "squashfs",
423 .get_sb = squashfs_get_sb,
424 .kill_sb = kill_block_super,
425 .fs_flags = FS_REQUIRES_DEV
426};
427
428static struct super_operations squashfs_super_ops = {
429 .alloc_inode = squashfs_alloc_inode,
430 .destroy_inode = squashfs_destroy_inode,
431 .statfs = squashfs_statfs,
432 .put_super = squashfs_put_super,
433 .remount_fs = squashfs_remount
434};
435
436module_init(init_squashfs_fs);
437module_exit(exit_squashfs_fs);
438MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
439MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
440MODULE_LICENSE("GPL");
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 000000000000..83d87880aac8
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * symlink.c
22 */
23
24/*
25 * This file implements code to handle symbolic links.
26 *
27 * The data contents of symbolic links are stored inside the symbolic
28 * link inode within the inode table. This allows the normally small symbolic
29 * link to be compressed as part of the inode table, achieving much greater
30 * compression than if the symbolic link was compressed individually.
31 */
32
33#include <linux/fs.h>
34#include <linux/vfs.h>
35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h>
38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46static int squashfs_symlink_readpage(struct file *file, struct page *page)
47{
48 struct inode *inode = page->mapping->host;
49 struct super_block *sb = inode->i_sb;
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int index = page->index << PAGE_CACHE_SHIFT;
52 u64 block = squashfs_i(inode)->start;
53 int offset = squashfs_i(inode)->offset;
54 int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
55 int bytes, copied;
56 void *pageaddr;
57 struct squashfs_cache_entry *entry;
58
59 TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
60 "%llx, offset %x\n", page->index, block, offset);
61
62 /*
63 * Skip index bytes into symlink metadata.
64 */
65 if (index) {
66 bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
67 index);
68 if (bytes < 0) {
69 ERROR("Unable to read symlink [%llx:%x]\n",
70 squashfs_i(inode)->start,
71 squashfs_i(inode)->offset);
72 goto error_out;
73 }
74 }
75
76 /*
77 * Read length bytes from symlink metadata. Squashfs_read_metadata
78 * is not used here because it can sleep and we want to use
79 * kmap_atomic to map the page. Instead call the underlying
80 * squashfs_cache_get routine. As length bytes may overlap metadata
81 * blocks, we may need to call squashfs_cache_get multiple times.
82 */
83 for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
84 entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
85 if (entry->error) {
86 ERROR("Unable to read symlink [%llx:%x]\n",
87 squashfs_i(inode)->start,
88 squashfs_i(inode)->offset);
89 squashfs_cache_put(entry);
90 goto error_out;
91 }
92
93 pageaddr = kmap_atomic(page, KM_USER0);
94 copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
95 length - bytes);
96 if (copied == length - bytes)
97 memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
98 else
99 block = entry->next_index;
100 kunmap_atomic(pageaddr, KM_USER0);
101 squashfs_cache_put(entry);
102 }
103
104 flush_dcache_page(page);
105 SetPageUptodate(page);
106 unlock_page(page);
107 return 0;
108
109error_out:
110 SetPageError(page);
111 unlock_page(page);
112 return 0;
113}
114
115
116const struct address_space_operations squashfs_symlink_aops = {
117 .readpage = squashfs_symlink_readpage
118};
diff --git a/fs/super.c b/fs/super.c
index cb20744ec789..ed080c417167 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -458,7 +458,6 @@ void sync_filesystems(int wait)
458 if (sb->s_flags & MS_RDONLY) 458 if (sb->s_flags & MS_RDONLY)
459 continue; 459 continue;
460 sb->s_need_sync_fs = 1; 460 sb->s_need_sync_fs = 1;
461 async_synchronize_full_special(&sb->s_async_list);
462 } 461 }
463 462
464restart: 463restart:
@@ -471,6 +470,7 @@ restart:
471 sb->s_count++; 470 sb->s_count++;
472 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
473 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list);
474 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
475 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
476 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
@@ -810,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
810 } 810 }
811 811
812 s->s_flags |= MS_ACTIVE; 812 s->s_flags |= MS_ACTIVE;
813 bdev->bd_super = s;
813 } 814 }
814 815
815 return simple_set_mnt(mnt, s); 816 return simple_set_mnt(mnt, s);
@@ -829,6 +830,7 @@ void kill_block_super(struct super_block *sb)
829 struct block_device *bdev = sb->s_bdev; 830 struct block_device *bdev = sb->s_bdev;
830 fmode_t mode = sb->s_mode; 831 fmode_t mode = sb->s_mode;
831 832
833 bdev->bd_super = 0;
832 generic_shutdown_super(sb); 834 generic_shutdown_super(sb);
833 sync_blockdev(bdev); 835 sync_blockdev(bdev);
834 close_bdev_exclusive(bdev, mode); 836 close_bdev_exclusive(bdev, mode);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 67205f6198ba..e5be1e0be802 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1546,21 +1546,6 @@ xfs_file_ioctl(
1546 return -error; 1546 return -error;
1547 } 1547 }
1548 1548
1549 case XFS_IOC_FREEZE:
1550 if (!capable(CAP_SYS_ADMIN))
1551 return -EPERM;
1552
1553 if (inode->i_sb->s_frozen == SB_UNFROZEN)
1554 freeze_bdev(inode->i_sb->s_bdev);
1555 return 0;
1556
1557 case XFS_IOC_THAW:
1558 if (!capable(CAP_SYS_ADMIN))
1559 return -EPERM;
1560 if (inode->i_sb->s_frozen != SB_UNFROZEN)
1561 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
1562 return 0;
1563
1564 case XFS_IOC_GOINGDOWN: { 1549 case XFS_IOC_GOINGDOWN: {
1565 __uint32_t in; 1550 __uint32_t in;
1566 1551
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0504cece9f66..50903ad3182e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -632,8 +632,6 @@ xfs_file_compat_ioctl(
632 case XFS_IOC_SET_RESBLKS: 632 case XFS_IOC_SET_RESBLKS:
633 case XFS_IOC_GET_RESBLKS: 633 case XFS_IOC_GET_RESBLKS:
634 case XFS_IOC_FSGROWFSLOG: 634 case XFS_IOC_FSGROWFSLOG:
635 case XFS_IOC_FREEZE:
636 case XFS_IOC_THAW:
637 case XFS_IOC_GOINGDOWN: 635 case XFS_IOC_GOINGDOWN:
638 case XFS_IOC_ERROR_INJECTION: 636 case XFS_IOC_ERROR_INJECTION:
639 case XFS_IOC_ERROR_CLEARALL: 637 case XFS_IOC_ERROR_CLEARALL:
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index be846d606ae8..95a971080368 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1269,14 +1269,14 @@ xfs_fs_remount(
1269 * need to take care of the metadata. Once that's done write a dummy 1269 * need to take care of the metadata. Once that's done write a dummy
1270 * record to dirty the log in case of a crash while frozen. 1270 * record to dirty the log in case of a crash while frozen.
1271 */ 1271 */
1272STATIC void 1272STATIC int
1273xfs_fs_lockfs( 1273xfs_fs_freeze(
1274 struct super_block *sb) 1274 struct super_block *sb)
1275{ 1275{
1276 struct xfs_mount *mp = XFS_M(sb); 1276 struct xfs_mount *mp = XFS_M(sb);
1277 1277
1278 xfs_quiesce_attr(mp); 1278 xfs_quiesce_attr(mp);
1279 xfs_fs_log_dummy(mp); 1279 return -xfs_fs_log_dummy(mp);
1280} 1280}
1281 1281
1282STATIC int 1282STATIC int
@@ -1557,7 +1557,7 @@ static struct super_operations xfs_super_operations = {
1557 .put_super = xfs_fs_put_super, 1557 .put_super = xfs_fs_put_super,
1558 .write_super = xfs_fs_write_super, 1558 .write_super = xfs_fs_write_super,
1559 .sync_fs = xfs_fs_sync_super, 1559 .sync_fs = xfs_fs_sync_super,
1560 .write_super_lockfs = xfs_fs_lockfs, 1560 .freeze_fs = xfs_fs_freeze,
1561 .statfs = xfs_fs_statfs, 1561 .statfs = xfs_fs_statfs,
1562 .remount_fs = xfs_fs_remount, 1562 .remount_fs = xfs_fs_remount,
1563 .show_options = xfs_fs_show_options, 1563 .show_options = xfs_fs_show_options,
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 589c41c38446..f7c06fac8229 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -465,8 +465,8 @@ typedef struct xfs_handle {
465#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) 465#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
466#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 466#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
467/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 467/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
468#define XFS_IOC_FREEZE _IOWR('X', 119, int) 468/* XFS_IOC_FREEZE -- FIFREEZE 119 */
469#define XFS_IOC_THAW _IOWR('X', 120, int) 469/* XFS_IOC_THAW -- FITHAW 120 */
470#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 470#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
471#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 471#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
472#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) 472#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 852b6d32e8d0..680d0e0ec932 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -595,17 +595,19 @@ out:
595 return 0; 595 return 0;
596} 596}
597 597
598void 598int
599xfs_fs_log_dummy( 599xfs_fs_log_dummy(
600 xfs_mount_t *mp) 600 xfs_mount_t *mp)
601{ 601{
602 xfs_trans_t *tp; 602 xfs_trans_t *tp;
603 xfs_inode_t *ip; 603 xfs_inode_t *ip;
604 int error;
604 605
605 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 606 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
606 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { 607 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
608 if (error) {
607 xfs_trans_cancel(tp, 0); 609 xfs_trans_cancel(tp, 0);
608 return; 610 return error;
609 } 611 }
610 612
611 ip = mp->m_rootip; 613 ip = mp->m_rootip;
@@ -615,9 +617,10 @@ xfs_fs_log_dummy(
615 xfs_trans_ihold(tp, ip); 617 xfs_trans_ihold(tp, ip);
616 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 618 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
617 xfs_trans_set_sync(tp); 619 xfs_trans_set_sync(tp);
618 xfs_trans_commit(tp, 0); 620 error = xfs_trans_commit(tp, 0);
619 621
620 xfs_iunlock(ip, XFS_ILOCK_EXCL); 622 xfs_iunlock(ip, XFS_ILOCK_EXCL);
623 return error;
621} 624}
622 625
623int 626int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61ad..88435e0a77c9 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern void xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */